From 0adc241e33f90382bd66adc55f34518af33802d1 Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Tue, 11 Nov 2025 08:08:32 -0700 Subject: [PATCH 01/18] Refactors CustomSchedule.py for scale out of schedules (#2574) ## Motivation We are about to land a significant number of custom schedules for various tile sizes and transpose orientations. By using the convention "a schedule is described in a function" we can reasonably scale the code out, instead of a chain of if/elif blocks that spans hundreds or thousands of lines. Initially we keep the existing schedules in functions inside CustomSchedule.py, but we can refactor as needed. ## Technical Details ## Test Plan TENSILELITE_CLIENT_ARGS="--build-type Debug --gpu-targets gfx950 --clean" tox -e py39 -- Tensile/Tests -k custom_mainloop_scheduling.yaml ## Test Result --- .../Tensile/Components/CustomSchedule.py | 491 +++++++++--------- 1 file changed, 250 insertions(+), 241 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index 25803e45345..ed7d59e2efc 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -248,6 +248,253 @@ def scheduleInst2(instList, macroGuard=""): return module, numCodePath +def isNN(kernel): + return not kernel["ProblemType"]["TransposeA"] and not kernel["ProblemType"]["TransposeB"] + +def isNT(kernel): + return not kernel["ProblemType"]["TransposeA"] and kernel["ProblemType"]["TransposeB"] + +def isTT(kernel): + return kernel["ProblemType"]["TransposeA"] and kernel["ProblemType"]["TransposeB"] + +def isTN(kernel): + return kernel["ProblemType"]["TransposeA"] and not kernel["ProblemType"]["TransposeB"] + +def _get_schedule_256x256x64_16bit(kernel, useLDSTr, TLDS): + kernel["MfmaInitCVgprs"] = True + + optSchedule = dict() + syncCode = [] + + if isTN(kernel) and TLDS == 1: + optSchedule = { + 'SYNC' : [[19,20, 50,51, 67,68, 104, 105]], + 'GRIncA' : [[0,1,2,3,4,5,6,7,8]], + 'GRIncB' : [[9,10,11,12,13,14,15,16,17]], + 'LRA0' : [[0,2,4,6,8,10,12,14], + [1,3,5,7,9,11,13,15]], + 'LRB0' : [[24,27,30,33,36,38,40,42], + [22,25,28,31,34,37,39,41]], + 'GRA' : [[21,22, 23,25, 26,28, 29,31, 32,34, 35,52, 53,55, 56,58], + [21,23, 24,26, 27,29, 30,32, 33,35, 36,53, 54,56, 57,59]], + 'GRB' : [[59,61, 62,64, 65,85, 86,87, 88,89, 94,96, 98,100, 102,124], + [60,62, 63,65, 66,84, 85,86, 87,88, 93,95, 97,99, 103,123]], + 'LRA1' : [[69,71,73,75,77,79,81,83], + [70,72,74,76,78,80,82,90]], + 'LRB1' : [[106,108,110,112,114,116,118,120], + [107,109,111,113,115,117,119,121]], + 'LRSA' : [[16]], + 'LRSB' : [[83]], + 'LWSA' : [[125]], + 'LWSB' : [[125]], + 'LCC' : [[126, 126]], + } + syncCode = [SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=-1, vlcnt=(2 + 8 + 8), vscnt=-1, comment="Wait for previous GRA to completely"), + SBarrier(comment=""), + SWaitCnt(dscnt=-1, vlcnt=15, vscnt=-1, comment="Wait for previous GRA to completely"), + SBarrier(comment="")] + elif isNT(kernel) and not useLDSTr and TLDS == 0: + kernel["UsePLRPack"] = True + + optSchedule = { + 'SYNC' : [[12,13, 36,44, 56,59, 66,68, 73,92]], + 'GRIncA' : [[0,1,2,3,4,5,6,7,8]], + 'GRIncB' : [[28,29,30,31,32,33,34,35,36]], + 'LRA0' : [[0,0,2,2,4,4,6,6], + [1,1,3,3,5,5,7,7]], + 'LRB0' : [[8,8,10,10,15,15,18,18], + [9,9,11,11,14,14,17,17]], + 'GRA' : [[14,14, 17,17, 20,20, 23,23, 26,26, 45,45, 48,48, 51,51], + [15,15, 18,18, 21,21, 24,24, 27,27, 46,46, 49,49, 52,52]], + 'GRB' : [[54,54, 57,57, 87,87,90,90,93,93,96,96,99,99, 123,123], + [55,55, 58,58, 88,88,91,91,94,94,97,97,100,100, 124,124]], + 'LRA1' : [[60,60,62,62,64,64,66,66], + [61,61,63,63,65,65,67,67]], + 'LRB1' : [[69,69,71,71,73,73,75,75], + [70,70,72,72,74,74,76,76]], + 'LRSA' : [[59]], + 'LRSB' : [[59]], + 'LWSA' : [[125]], + 'LWSB' : [[125]], + 'LCC' : [[126, 126]], + 'PackA0' : [[16,16, 19,19, 21,21, 22,22, 24,24, 25,25, 27,27, 28,28, 29,29, 30,30, 31,31, 32,32, 33,33, 34,34, 35,35, 36,36], + [16,16, 19,19, 20,20, 22,22, 23,23, 25,25, 26,26, 28,28, 29,29, 30,30, 31,31, 32,32, 33,33, 34,34, 35,35, 36,36]], + 'PackB0' : [[37,37, 38,38, 39,39, 40,40, 41,41, 42,42, 43,43, 46,46, 47,47, 49,49, 50,50, 52,52, 53,53, 55,55, 56,56, 58,58], + [37,37, 38,38, 39,39, 40,40, 41,41, 42,42, 43,43, 45,45, 47,47, 48,48, 50,50, 51,51, 53,53, 54,54, 56,56, 57,57]], + 'PackA1' : [[74,74, 76,76, 77,77, 78,78, 79,79, 80,80, 81,81, 82,82, 83,83, 84,84, 85,85, 86,86, 88,88, 89,89, 91,91, 92,92], + [75,75, 77,77, 78,78, 79,79, 80,80, 81,81, 82,82, 83,83, 84,84, 85,85, 86,86, 87,87, 89,89, 90,90, 92,92, 93,93]], + 'PackB1' : [[94,94, 95,95, 97,97, 98,98, 100,100, 101,101, 102,102, 103,103, 104,104, 105,105, 106,106, 107,107, 108,108, 109,109, 110,110, 111,111], + [95,95, 96,96, 98,98, 99,99, 101,101, 102,102, 103,103, 104,104, 105,105, 106,106, 107,107, 108,108, 109,109, 110,110, 111,111, 112,112]], + } + syncCode = [SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=-1, vlcnt=17, vscnt=-1, comment="Wait for GRA to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=-1, vlcnt=9, vscnt=-1, comment="Wait for GRB to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="Wait for LRA1 to complete"), + SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB1 to complete")] + elif (isNN(kernel) or isTT(kernel)) and not useLDSTr and TLDS == 1: + kernel["UsePLRPack"] = True + + optSchedule = { + 'SYNC' : [[8, 12,13, 36,44, 56,59, 66,68, 73]], + 'GRIncA' : [[0,1,2,3,4,5,6,7,8]], + 'GRIncB' : [[28,29,30,31,32,33,34,35,36]], + 'LRA0' : [[0,0,2,2,4,4,6,6], + [1,1,3,3,5,5,7,7]], + 'LRB0' : [[9,11, 15,18,21,24,27,30], + [10,12, 14,17,20,23,26,29]], + 'GRA' : [[14,14, 17,17, 20,20, 23,23, 26,26, 45,45, 48,48, 51,51], + [15,15, 18,18, 21,21, 24,24, 27,27, 46,46, 49,49, 52,52]], + 'GRB' : [[54,54, 57,57, 87,87,90,90,93,93,96,96,99,99, 123,123], + [55,55, 58,58, 88,88,91,91,94,94,97,97,100,100, 124,124]], + 'LRA1' : [[60,60,62,62,64,64,66,66], + [61,61,63,63,65,65,67,67]], + 'LRB1' : [[68,70,72,74,76,78,80,82], + [69,71,73,75,77,79,81,83]], + 'LRSA' : [[59]], + 'LRSB' : [[59]], + 'LWSA' : [[125]], + 'LWSB' : [[125]], + 'LCC' : [[126, 126]], + 'PackA0' : [[8,8, 16,16, 19,19, 22,22, 25,25, 28,28, 29,29, 31,31, 32,32, 33,33, 34,34, 35,35, 36,36, 37,37, 38,38, 39,39]], + 'PackA1' : [[75,75, 77,77, 79,79, 81,81, 83,83, 84,84, 85,85, 86,86, 88,88, 89,89, 91,91, 92,92, 94,94, 95,95, 97,97, 98,98], + [74,74, 76,76, 78,78, 80,80, 82,82, 84,84, 85,85, 86,86, 87,87, 89,89, 90,90, 92,92, 93,93, 95,95, 96,96, 98,98]], + } + syncCode = [SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 first half to complete"), + SWaitCnt(dscnt=1, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=-1, vlcnt=17, vscnt=-1, comment="Wait for GRA to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=-1, vlcnt=9, vscnt=-1, comment="Wait for GRB to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="Wait for LRA1 to complete")] + if isTT(kernel): + kernel["SwapGlobalReadOrder"] = True + + optSchedule['GRIncA'], optSchedule['GRIncB'] = optSchedule['GRIncB'], optSchedule['GRIncA'] + optSchedule['LRA0'], optSchedule['LRB0'] = optSchedule['LRB0'], optSchedule['LRA0'] + optSchedule['LRA1'], optSchedule['LRB1'] = optSchedule['LRB1'], optSchedule['LRA1'] + optSchedule['PackB0'] = optSchedule['PackA0'] + optSchedule['PackB1'] = optSchedule['PackA1'] + del optSchedule['PackA0'], optSchedule['PackA1'] + else: + return False, None + + + numMfma = 128 + opt1 = ScheduleInfo(2, numMfma, optSchedule, syncCode) + return True, opt1 + +def _get_schedule_256x256x128_8bit(kernel, useLDSTr, TLDS): + kernel["MfmaInitCVgprs"] = True + + optSchedule = dict() + syncCode = [] + + plr = 3 if kernel["ForceUnrollSubIter"] else 1 + + if isTN(kernel) and TLDS == 1: + optSchedule = { + 'SYNC' : [[6,7, 20,21, 46,47, 61]], + 'GRIncA' : [[0,1,2,3,4,4,4,4,4]], + 'GRIncB' : [[5,5,5,5,5,6,6,6,6]], + 'LRA0' : [[0,0, 1,1, 2,2, 3,3]], + 'GRA' : [[8,8,9,9,10,10,11,11,12,12, 23,23,24,24,25,25]], + 'LRB0' : [[13,13,14,14,15,15,16,16]], + 'LRA%u'%plr : [[48,48,49,49,50,50,51,51]], + 'LRB%u'%plr : [[52,52,54,54,55,55,56,56]], + 'GRB' : [[26,26,27,27, 39,39,40,40,41,41,42,42,43,43, 53,53]], + 'LCC' : [[60, 60]], + 'LRSA' : [[17]], + 'LRSB' : [[17]], + 'LWSA' : [[57]], + 'LWSB' : [[57]], + } + syncCode = [SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRA0/LRB0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRA0/LRB0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=-1, vlcnt=15, vscnt=-1, comment="Wait for GRA to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for PLR to complete")] + else: + return False, None + + numMfma = 64 + # B0A0, B0A1, B1A0, B1A1 + mfmaReorder = [] + if not kernel["ForceUnrollSubIter"]: + mfmaReorder = [0,1,2,3, 8,9,10,11, 16,17,18,19, 24,25,26,27, + 4,5,6,7, 12,13,14,15, 20,21,22,23, 28,29,30,31, + 32,33,34,35, 40,41,42,43, 48,49,50,51, 56,57,58,59, + 36,37,38,39, 44,45,46,47, 52,53,54,55, 60,61,62,63] + opt1 = ScheduleInfo(1, numMfma, optSchedule, syncCode, mfmaReorder) + return True, opt1 + +def _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS): + kernel["MfmaInitCVgprs"] = True + + optSchedule = dict() + syncCode = [] + if isNN(kernel) and useLDSTr and TLDS==1: + # TODO: This schedule can be improved when BC are resolved for MT192 + # Note: A/B Global read orders are swapped + # i.e. GRA contains GR for B + kernel["SwapGlobalReadOrder"] = True + optSchedule = { + 'SYNC' : [[12,13, 47,48,49,50,51, 52,53, 56,56, 94]], + 'GRIncB' : [[0,1,2,3,4,5,6,7,8]], + 'GRIncA' : [[9,10,11,12,13,14,15,16,17]], + 'LRB0' : [[0,0,1,1,2,2,6,8], + [3,3,4,4,5,5,7,9]], + # These local reads have BC + 'LRA0' : [[10, 15,17,19,21,23, 25,27,29,33,37,39], + [11, 14,16,18,20,22, 24,26,28,32,36,38]], + 'GRA' : [[14,14, 16,16, 18,18, 20,20, 22,22, 34,34,36,36,38,38], + [15,15, 17,17, 19,19, 21,21, 23,23, 35,35,37,37,39,39]], + 'GRB' : [[54,54, 56,56, 58,58, 60,60, 62,62, 64,64], + [55,55, 57,57, 59,59, 61,61, 63,63, 65,65]], + 'LRSA' : [[40]], + 'LRSB' : [[40]], + 'LWSB' : [[41]], # For B + 'LWSA' : [[66]], # For A + 'LRB1' : [[57,57,59,59,61,61,63,65], + [58,58,60,60,62,62,64,64]], + 'LRA1' : [[67,71,73,75,77,79,81,85,87,89,91,93], + [68,72,74,76,78,80,82,86,88,90,92,94]], + 'LCC' : [[95, 95]], + } + syncCode = [SWaitCnt(dscnt=1, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=10, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), + SWaitCnt(dscnt=8, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), + SWaitCnt(dscnt=6, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), + SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), + SWaitCnt(dscnt=2, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), + SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=-1, vlcnt=9, vscnt=-1, comment="Wait for LRB0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"),] + + else: + return False, None + + numMfma = 96 + opt1 = ScheduleInfo(2, numMfma, optSchedule, syncCode) + return True, opt1 + + def hasCustomSchedule(kernel): if not kernel["UseCustomMainLoopSchedule"]: @@ -274,249 +521,11 @@ def hasCustomSchedule(kernel): is192x256x64DTL = [MT0, MT1, DU, PGR, PLR, DTL] == [192, 256, 64, 2, 1, True] is256x256x128DTL = [MT0, MT1, DU, PGR, PLR, DTL] == [256, 256, 128, 2, 0, True] - - transA = kernel["ProblemType"]["TransposeA"] - transB = kernel["ProblemType"]["TransposeB"] - - isNN = transA == False and transB == False - isNT = transA == False and transB == True - isTT = transA == True and transB == True - isTN = transA == True and transB == False - - # Custom main loop scheduling for 256x256x64 16bit if is256x256x64DTL and is16bit and not isMixed and ([GRVWA, GRVWB, LRVW] == [8,8,8]) and MI == [16,16,32,1] and MIWG == [2,2]: - - kernel["MfmaInitCVgprs"] = True - - optSchedule = dict() - syncCode = [] - - if isTN and TLDS == 1: - optSchedule = { - 'SYNC' : [[19,20, 50,51, 67,68, 104, 105]], - 'GRIncA' : [[0,1,2,3,4,5,6,7,8]], - 'GRIncB' : [[9,10,11,12,13,14,15,16,17]], - 'LRA0' : [[0,2,4,6,8,10,12,14], - [1,3,5,7,9,11,13,15]], - 'LRB0' : [[24,27,30,33,36,38,40,42], - [22,25,28,31,34,37,39,41]], - 'GRA' : [[21,22, 23,25, 26,28, 29,31, 32,34, 35,52, 53,55, 56,58], - [21,23, 24,26, 27,29, 30,32, 33,35, 36,53, 54,56, 57,59]], - 'GRB' : [[59,61, 62,64, 65,85, 86,87, 88,89, 94,96, 98,100, 102,124], - [60,62, 63,65, 66,84, 85,86, 87,88, 93,95, 97,99, 103,123]], - 'LRA1' : [[69,71,73,75,77,79,81,83], - [70,72,74,76,78,80,82,90]], - 'LRB1' : [[106,108,110,112,114,116,118,120], - [107,109,111,113,115,117,119,121]], - 'LRSA' : [[16]], - 'LRSB' : [[83]], - 'LWSA' : [[125]], - 'LWSB' : [[125]], - 'LCC' : [[126, 126]], - } - syncCode = [SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=-1, vlcnt=(2 + 8 + 8), vscnt=-1, comment="Wait for previous GRA to completely"), - SBarrier(comment=""), - SWaitCnt(dscnt=-1, vlcnt=15, vscnt=-1, comment="Wait for previous GRA to completely"), - SBarrier(comment="")] - elif isNT and not useLDSTr and TLDS == 0: - kernel["UsePLRPack"] = True - - optSchedule = { - 'SYNC' : [[12,13, 36,44, 56,59, 66,68, 73,92]], - 'GRIncA' : [[0,1,2,3,4,5,6,7,8]], - 'GRIncB' : [[28,29,30,31,32,33,34,35,36]], - 'LRA0' : [[0,0,2,2,4,4,6,6], - [1,1,3,3,5,5,7,7]], - 'LRB0' : [[8,8,10,10,15,15,18,18], - [9,9,11,11,14,14,17,17]], - 'GRA' : [[14,14, 17,17, 20,20, 23,23, 26,26, 45,45, 48,48, 51,51], - [15,15, 18,18, 21,21, 24,24, 27,27, 46,46, 49,49, 52,52]], - 'GRB' : [[54,54, 57,57, 87,87,90,90,93,93,96,96,99,99, 123,123], - [55,55, 58,58, 88,88,91,91,94,94,97,97,100,100, 124,124]], - 'LRA1' : [[60,60,62,62,64,64,66,66], - [61,61,63,63,65,65,67,67]], - 'LRB1' : [[69,69,71,71,73,73,75,75], - [70,70,72,72,74,74,76,76]], - 'LRSA' : [[59]], - 'LRSB' : [[59]], - 'LWSA' : [[125]], - 'LWSB' : [[125]], - 'LCC' : [[126, 126]], - 'PackA0' : [[16,16, 19,19, 21,21, 22,22, 24,24, 25,25, 27,27, 28,28, 29,29, 30,30, 31,31, 32,32, 33,33, 34,34, 35,35, 36,36], - [16,16, 19,19, 20,20, 22,22, 23,23, 25,25, 26,26, 28,28, 29,29, 30,30, 31,31, 32,32, 33,33, 34,34, 35,35, 36,36]], - 'PackB0' : [[37,37, 38,38, 39,39, 40,40, 41,41, 42,42, 43,43, 46,46, 47,47, 49,49, 50,50, 52,52, 53,53, 55,55, 56,56, 58,58], - [37,37, 38,38, 39,39, 40,40, 41,41, 42,42, 43,43, 45,45, 47,47, 48,48, 50,50, 51,51, 53,53, 54,54, 56,56, 57,57]], - 'PackA1' : [[74,74, 76,76, 77,77, 78,78, 79,79, 80,80, 81,81, 82,82, 83,83, 84,84, 85,85, 86,86, 88,88, 89,89, 91,91, 92,92], - [75,75, 77,77, 78,78, 79,79, 80,80, 81,81, 82,82, 83,83, 84,84, 85,85, 86,86, 87,87, 89,89, 90,90, 92,92, 93,93]], - 'PackB1' : [[94,94, 95,95, 97,97, 98,98, 100,100, 101,101, 102,102, 103,103, 104,104, 105,105, 106,106, 107,107, 108,108, 109,109, 110,110, 111,111], - [95,95, 96,96, 98,98, 99,99, 101,101, 102,102, 103,103, 104,104, 105,105, 106,106, 107,107, 108,108, 109,109, 110,110, 111,111, 112,112]], - } - syncCode = [SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=-1, vlcnt=17, vscnt=-1, comment="Wait for GRA to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=-1, vlcnt=9, vscnt=-1, comment="Wait for GRB to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="Wait for LRA1 to complete"), - SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB1 to complete")] - elif (isNN or isTT) and not useLDSTr and TLDS == 1: - kernel["UsePLRPack"] = True - - optSchedule = { - 'SYNC' : [[8, 12,13, 36,44, 56,59, 66,68, 73]], - 'GRIncA' : [[0,1,2,3,4,5,6,7,8]], - 'GRIncB' : [[28,29,30,31,32,33,34,35,36]], - 'LRA0' : [[0,0,2,2,4,4,6,6], - [1,1,3,3,5,5,7,7]], - 'LRB0' : [[9,11, 15,18,21,24,27,30], - [10,12, 14,17,20,23,26,29]], - 'GRA' : [[14,14, 17,17, 20,20, 23,23, 26,26, 45,45, 48,48, 51,51], - [15,15, 18,18, 21,21, 24,24, 27,27, 46,46, 49,49, 52,52]], - 'GRB' : [[54,54, 57,57, 87,87,90,90,93,93,96,96,99,99, 123,123], - [55,55, 58,58, 88,88,91,91,94,94,97,97,100,100, 124,124]], - 'LRA1' : [[60,60,62,62,64,64,66,66], - [61,61,63,63,65,65,67,67]], - 'LRB1' : [[68,70,72,74,76,78,80,82], - [69,71,73,75,77,79,81,83]], - 'LRSA' : [[59]], - 'LRSB' : [[59]], - 'LWSA' : [[125]], - 'LWSB' : [[125]], - 'LCC' : [[126, 126]], - 'PackA0' : [[8,8, 16,16, 19,19, 22,22, 25,25, 28,28, 29,29, 31,31, 32,32, 33,33, 34,34, 35,35, 36,36, 37,37, 38,38, 39,39]], - 'PackA1' : [[75,75, 77,77, 79,79, 81,81, 83,83, 84,84, 85,85, 86,86, 88,88, 89,89, 91,91, 92,92, 94,94, 95,95, 97,97, 98,98], - [74,74, 76,76, 78,78, 80,80, 82,82, 84,84, 85,85, 86,86, 87,87, 89,89, 90,90, 92,92, 93,93, 95,95, 96,96, 98,98]], - } - syncCode = [SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 first half to complete"), - SWaitCnt(dscnt=1, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=-1, vlcnt=17, vscnt=-1, comment="Wait for GRA to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=-1, vlcnt=9, vscnt=-1, comment="Wait for GRB to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="Wait for LRA1 to complete")] - if isTT: - kernel["SwapGlobalReadOrder"] = True - - optSchedule['GRIncA'], optSchedule['GRIncB'] = optSchedule['GRIncB'], optSchedule['GRIncA'] - optSchedule['LRA0'], optSchedule['LRB0'] = optSchedule['LRB0'], optSchedule['LRA0'] - optSchedule['LRA1'], optSchedule['LRB1'] = optSchedule['LRB1'], optSchedule['LRA1'] - optSchedule['PackB0'] = optSchedule['PackA0'] - optSchedule['PackB1'] = optSchedule['PackA1'] - del optSchedule['PackA0'], optSchedule['PackA1'] - else: - return False, None - - - numMfma = 128 - opt1 = ScheduleInfo(2, numMfma, optSchedule, syncCode) - return True, opt1 + return _get_schedule_256x256x64_16bit(kernel, useLDSTr, TLDS) elif is256x256x128DTL and is8bit and not isMixed and ([GRVWA, GRVWB, LRVW] == [16, 16, 16]) and MI == [16,16,128,1] and MIWG == [2,2]: - - kernel["MfmaInitCVgprs"] = True - - optSchedule = dict() - syncCode = [] - - plr = 3 if kernel["ForceUnrollSubIter"] else 1 - - if isTN and TLDS == 1: - optSchedule = { - 'SYNC' : [[6,7, 20,21, 46,47, 61]], - 'GRIncA' : [[0,1,2,3,4,4,4,4,4]], - 'GRIncB' : [[5,5,5,5,5,6,6,6,6]], - 'LRA0' : [[0,0, 1,1, 2,2, 3,3]], - 'GRA' : [[8,8,9,9,10,10,11,11,12,12, 23,23,24,24,25,25]], - 'LRB0' : [[13,13,14,14,15,15,16,16]], - 'LRA%u'%plr : [[48,48,49,49,50,50,51,51]], - 'LRB%u'%plr : [[52,52,54,54,55,55,56,56]], - 'GRB' : [[26,26,27,27, 39,39,40,40,41,41,42,42,43,43, 53,53]], - 'LCC' : [[60, 60]], - 'LRSA' : [[17]], - 'LRSB' : [[17]], - 'LWSA' : [[57]], - 'LWSB' : [[57]], - } - syncCode = [SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRA0/LRB0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRA0/LRB0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=-1, vlcnt=15, vscnt=-1, comment="Wait for GRA to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for PLR to complete")] - else: - return False, None - - numMfma = 64 - # B0A0, B0A1, B1A0, B1A1 - mfmaReorder = [] - if not kernel["ForceUnrollSubIter"]: - mfmaReorder = [0,1,2,3, 8,9,10,11, 16,17,18,19, 24,25,26,27, - 4,5,6,7, 12,13,14,15, 20,21,22,23, 28,29,30,31, - 32,33,34,35, 40,41,42,43, 48,49,50,51, 56,57,58,59, - 36,37,38,39, 44,45,46,47, 52,53,54,55, 60,61,62,63] - opt1 = ScheduleInfo(1, numMfma, optSchedule, syncCode, mfmaReorder) - return True, opt1 + return _get_schedule_256x256x128_8bit(kernel, useLDSTr, TLDS) elif is192x256x64DTL and is16bit and not isMixed and ([GRVWA, GRVWB, LRVW] == [8, 8, 8]) and MI == [16,16,32,1] and MIWG == [2,2]: - - kernel["MfmaInitCVgprs"] = True - - optSchedule = dict() - syncCode = [] - if isNN and useLDSTr and TLDS==1: - # TODO: This schedule can be improved when BC are resolved for MT192 - # Note: A/B Global read orders are swapped - # i.e. GRA contains GR for B - kernel["SwapGlobalReadOrder"] = True - optSchedule = { - 'SYNC' : [[12,13, 47,48,49,50,51, 52,53, 56,56, 94]], - 'GRIncB' : [[0,1,2,3,4,5,6,7,8]], - 'GRIncA' : [[9,10,11,12,13,14,15,16,17]], - 'LRB0' : [[0,0,1,1,2,2,6,8], - [3,3,4,4,5,5,7,9]], - # These local reads have BC - 'LRA0' : [[10, 15,17,19,21,23, 25,27,29,33,37,39], - [11, 14,16,18,20,22, 24,26,28,32,36,38]], - 'GRA' : [[14,14, 16,16, 18,18, 20,20, 22,22, 34,34,36,36,38,38], - [15,15, 17,17, 19,19, 21,21, 23,23, 35,35,37,37,39,39]], - 'GRB' : [[54,54, 56,56, 58,58, 60,60, 62,62, 64,64], - [55,55, 57,57, 59,59, 61,61, 63,63, 65,65]], - 'LRSA' : [[40]], - 'LRSB' : [[40]], - 'LWSB' : [[41]], # For B - 'LWSA' : [[66]], # For A - 'LRB1' : [[57,57,59,59,61,61,63,65], - [58,58,60,60,62,62,64,64]], - 'LRA1' : [[67,71,73,75,77,79,81,85,87,89,91,93], - [68,72,74,76,78,80,82,86,88,90,92,94]], - 'LCC' : [[95, 95]], - } - syncCode = [SWaitCnt(dscnt=1, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=10, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), - SWaitCnt(dscnt=8, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), - SWaitCnt(dscnt=6, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), - SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), - SWaitCnt(dscnt=2, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), - SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=-1, vlcnt=9, vscnt=-1, comment="Wait for LRB0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"),] - - else: - return False, None - - numMfma = 96 - opt1 = ScheduleInfo(2, numMfma, optSchedule, syncCode) - return True, opt1 + return _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS) return False, None From 7641aa746d7832a335266bef040ae137d6f23a7d Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Wed, 12 Nov 2025 05:51:21 +0000 Subject: [PATCH 02/18] add 192x256x64TN --- .../Tensile/Components/CustomSchedule.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index ed7d59e2efc..c6c9084e22c 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -487,6 +487,42 @@ def _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS): SBarrier(comment=""), SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"),] + elif isTN and not useLDSTr and TLDS == 1: + #index and code pair + syncTable = [8, SBarrier(comment="for GRA start"), + 23, SWaitCnt(dscnt=17, vlcnt=-1, vscnt=-1, comment="for LRB1-4"), + 29, SWaitCnt(dscnt=16, vlcnt=-1, vscnt=-1, comment="for LRB1-5"), + 35, SWaitCnt(dscnt=15, vlcnt=-1, vscnt=-1, comment="for LRB1-6"), + 41, SWaitCnt(dscnt=14, vlcnt=-1, vscnt=-1, comment="for LRB1-7"), + 47, SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="for LRA/B1"), + 46, SBarrier(comment="for GRB start"),] + optSchedule = { + 'SYNC' : [syncTable[::2]], + 'GRIncA': [[0,1,2,3,4,5,6,7,8]], + 'GRIncB': [[9,10,11,12,13,14,15,16,17]], + + 'LRA0' : [[0, 2, 3, 4, 5, 6]], + 'LRB0' : [[7, 9, 11, 13, 15, 17, 19, 21], + [8, 10, 12, 14, 16, 18, 20, 22]], + 'GRA' : [[8,8, 10,10, 12,12, 14,14, 26,26, 31,31], + [9,9, 11,11, 13,13, 15,15, 27,27, 32,32]], + + 'GRB' : [[46,46, 50,50, 54,54, 58,58, 62,62, 66,66, 70,70, 77,77], + [47,47, 51,51, 55,55, 59,59, 63,63, 67,67, 71,71, 78,78]], + 'LRA1' : [[48, 52, 56, 58, 60, 64], + [49, 53, 57, 59, 61, 65]], + # 0 1 2 3 4 5 6 7 + 'LRB1' : [[78, 80, 82, 84, 86, 90, 92, 94], + [79, 81, 83, 85, 87, 91, 93, 95]], + + 'LRSA' : [[22]], + 'LRSB' : [[23]], + + 'LWSA' : [[20]], + 'LWSB' : [[78]], + 'LCC' : [[95, 95]], + } + syncCode = syncTable[1::2] else: return False, None From 8f47b794fe3f42815335f66d5f5d107830f9858f Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Wed, 12 Nov 2025 05:52:51 +0000 Subject: [PATCH 03/18] typo --- .../hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index c6c9084e22c..a761d940aba 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -487,7 +487,7 @@ def _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS): SBarrier(comment=""), SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"),] - elif isTN and not useLDSTr and TLDS == 1: + elif isTN(kernel) and not useLDSTr and TLDS == 1: #index and code pair syncTable = [8, SBarrier(comment="for GRA start"), 23, SWaitCnt(dscnt=17, vlcnt=-1, vscnt=-1, comment="for LRB1-4"), From 1b39b510c93c55a8814cfaff3e4ef8100b9a66b1 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Mon, 10 Nov 2025 10:49:56 +0000 Subject: [PATCH 04/18] add test --- ...k_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml | 246 ++++++++++++++++++ .../gfx950/custom_mainloop_scheduling.yaml | 34 ++- 2 files changed, 279 insertions(+), 1 deletion(-) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml index 5b2ca51968f..0ceb68faa3b 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml @@ -301206,6 +301206,250 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1iMO6yjTG4yuZPDkcMXrcObwuUOXV_ipLalV5INSQqYE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123776 + LdsInitCVgprs: false + LdsNumBytes: 123776 + LdsNumElementsAlignedA: 24960 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24960 + LdsOffsetB_Blk: 90496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24960 + LdsOffsetMetadata_Blk: 90496 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumTotalPackedLoadsA: 6 + NumTotalPackedLoadsB: 8 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1289 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false - [2, 3, 0, 1] - - - [16, 368640, 1, 224] - [0, 0.0] @@ -303807,6 +304051,8 @@ - [1287, 0.0] - - [16032, 5, 1, 16384] - [1288, 0.0] + - - [3072, 4096, 1, 8192] + - [1289, 0.0] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml index 8d0f8886001..d28cc62bdf9 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml @@ -386,7 +386,39 @@ BenchmarkProblems: - Range: [[613], [612], [1], [1, 1, 64]] - Exact: [8192, 4096, 1, 2048] - BiasTypeArgs: ['b'] - + - # BenchmarkProblemSizeGroup - Standard - All problem + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16,32, 1, 1, 6, 8, 2,2 ] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - DepthU: [64] + - ScheduleIterAlg: [3] + - ExpandPointerSwap: [0] + - TransposeLDS: [1] #0,1 + - LocalReadVectorWidth: [8] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - DirectToLds: [1] + - StreamK: [3] + - LdsPadA: [8] #[-1] + - LdsPadB: [8] #[-1] + - StaggerU: [0] + - WorkGroupMapping: [16] + - WorkGroupMappingXCC: [2] + - 1LDSBuffer: [0] + - NonTemporalD: [4] + - SourceSwap: [1] + - UseSgprForGRO: [0] + - UseCustomMainLoopSchedule: [0, 1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [3072, 4096, 1, 8192] + - BiasTypeArgs: ['b'] ######################################## # HHS TN - standard ######################################## From 485c703df1c3c2c929494076f66935ef0affca25 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Wed, 12 Nov 2025 06:51:41 +0000 Subject: [PATCH 05/18] more test --- .../Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml index d28cc62bdf9..524d495ec4c 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml @@ -417,6 +417,10 @@ BenchmarkProblems: BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: + - Range: [[192], [256], [1], [64, 64, 256]] + - Range: [[192], [256], [1], [1, 1, 64]] + - Range: [[192], [256], [1], [32, 64, 256]] + - Range: [[6144], [8192], [1], [64, 64, 256]] - Exact: [3072, 4096, 1, 8192] - BiasTypeArgs: ['b'] ######################################## From 5a943a53d98c69e1f25907eb8a072a030967d941 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Fri, 14 Nov 2025 01:48:07 +0000 Subject: [PATCH 06/18] typo --- .../gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml index 45b71a16334..9c4bd13b8f6 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml @@ -308981,7 +308981,6 @@ - [1306, 0.0] - - [3072, 4096, 1, 8192] - [1308, 0.0] - - null - null - DeviceEfficiency From 461fb4f39c9833c20e108e70de6a619f67c6fe84 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Mon, 17 Nov 2025 03:35:50 +0000 Subject: [PATCH 07/18] fix random fail --- .../Tensile/Components/CustomSchedule.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index 1cc3db43c28..aaf95fd3d52 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -347,13 +347,17 @@ def _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS): nglshift = nllshift = 14 # vmcnt shift for ngl and nll elif isTN(kernel) and not useLDSTr and TLDS == 1: #index and code pair - syncTable = [8, SBarrier(comment="for GRA start"), - 23, SWaitCnt(dscnt=17, vlcnt=-1, vscnt=-1, comment="for LRB1-4"), - 29, SWaitCnt(dscnt=16, vlcnt=-1, vscnt=-1, comment="for LRB1-5"), - 35, SWaitCnt(dscnt=15, vlcnt=-1, vscnt=-1, comment="for LRB1-6"), - 41, SWaitCnt(dscnt=14, vlcnt=-1, vscnt=-1, comment="for LRB1-7"), - 47, SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="for LRA/B1"), - 46, SBarrier(comment="for GRB start"),] + syncTable = [-1, SWaitCnt(dscnt=7, vlcnt=-1, vscnt=-1, comment="for LRB1-0"), + 6, SWaitCnt(dscnt=6+5, vlcnt=-1, vscnt=-1, comment="for LRB1-1"), + 8, SBarrier(comment="for GRA start"), + 11, SWaitCnt(dscnt=5+8, vlcnt=-1, vscnt=-1, comment="for LRB1-2"), + 17, SWaitCnt(dscnt=4+11, vlcnt=-1, vscnt=-1, comment="for LRB1-3"), + 23, SWaitCnt(dscnt=15, vlcnt=-1, vscnt=-1, comment="for LRB1-4:6"), + 41, SWaitCnt(dscnt=14, vlcnt=-1, vscnt=-1, comment="for LRB1-7"), + 46, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="for LRA1"), + 48, SBarrier(comment="for LRA1 start"), + 78, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="for LRB1"), + 78, SBarrier(comment="for LRB1 start"),] optSchedule = { 'SYNC' : [syncTable[::2]], 'GRIncA': [[0,1,2,3,4,5,6,7,8]], @@ -365,11 +369,11 @@ def _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS): 'GRA' : [[8,8, 10,10, 12,12, 14,14, 26,26, 31,31], [9,9, 11,11, 13,13, 15,15, 27,27, 32,32]], - 'GRB' : [[46,46, 50,50, 54,54, 58,58, 62,62, 66,66, 70,70, 77,77], - [47,47, 51,51, 55,55, 59,59, 63,63, 67,67, 71,71, 78,78]], + 'GRB' : [[46,46, 50,50, 54,54, 58,58, 62,62, 66,66, 70,70, 76,76], + [47,47, 51,51, 55,55, 59,59, 63,63, 67,67, 71,71, 77,77]], 'LRA1' : [[48, 52, 56, 58, 60, 64], [49, 53, 57, 59, 61, 65]], - # 0 1 2 3 4 5 6 7 + # 0 1 2 3 4 5 6 7 'LRB1' : [[78, 80, 82, 84, 86, 90, 92, 94], [79, 81, 83, 85, 87, 91, 93, 95]], @@ -381,6 +385,7 @@ def _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS): 'LCC' : [[95, 95]], } syncCode = syncTable[1::2] + nglshift = nllshift = 14 # vmcnt shift for ngl and nll else: return False, None From ba7d50b93c10e40b87ecec51f46aaea63a7736d5 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Thu, 20 Nov 2025 08:14:01 +0000 Subject: [PATCH 08/18] refine --- ...k_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml | 432 ++++++++++++++---- .../Tensile/Components/CustomSchedule.py | 6 +- .../gfx950/custom_mainloop_scheduling.yaml | 38 +- 3 files changed, 380 insertions(+), 96 deletions(-) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml index 23673509c2c..7a555b3e645 100755 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml @@ -299037,7 +299037,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 0 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -299074,7 +299074,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 128 LSCB: 128 @@ -299185,7 +299185,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1279 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -299281,7 +299281,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -299318,7 +299318,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 128 LSCB: 128 @@ -299429,7 +299429,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1280 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -299525,7 +299525,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 0 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -299562,7 +299562,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -299673,7 +299673,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1281 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -299769,7 +299769,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -299806,7 +299806,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -299917,7 +299917,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1282 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -300013,7 +300013,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 0 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -300050,7 +300050,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x352x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_11_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x352x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_11_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -300161,7 +300161,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1283 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x352x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_11_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x352x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_11_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -300257,7 +300257,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 0 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -300294,7 +300294,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -300405,7 +300405,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1284 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -300501,7 +300501,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 0 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -300538,7 +300538,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x288x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x288x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -300649,7 +300649,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1285 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x288x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x288x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -300745,7 +300745,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 0 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -300782,7 +300782,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -300893,7 +300893,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1286 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -300989,7 +300989,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 0 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -301026,7 +301026,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -301137,7 +301137,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1287 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -301233,7 +301233,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -301270,7 +301270,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -301381,7 +301381,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1288 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -301477,7 +301477,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 0 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -301514,7 +301514,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -301625,7 +301625,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1289 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -301721,7 +301721,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 0 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -301758,7 +301758,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -301869,7 +301869,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1290 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -301965,7 +301965,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -302002,7 +302002,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false LSCA: 128 LSCB: 128 @@ -302113,7 +302113,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1291 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -302209,7 +302209,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 0 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -302246,7 +302246,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -302357,7 +302357,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1292 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -302453,7 +302453,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 0 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -302490,7 +302490,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -302601,7 +302601,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1293 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -302697,7 +302697,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 0 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -302734,7 +302734,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -302845,7 +302845,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1294 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -302941,7 +302941,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -302978,7 +302978,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -303089,7 +303089,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1295 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -303185,7 +303185,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 0 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -303222,7 +303222,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -303333,7 +303333,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1296 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -303429,7 +303429,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 0 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -303466,7 +303466,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -303577,7 +303577,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1297 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -303673,7 +303673,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 0 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -303710,7 +303710,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 128 LSCB: 128 @@ -303821,7 +303821,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1298 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -303917,7 +303917,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 0 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -303954,7 +303954,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 128 LSCB: 128 @@ -304065,7 +304065,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1299 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -304161,7 +304161,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 0 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -304198,7 +304198,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -304309,7 +304309,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1300 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -304405,7 +304405,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -304442,7 +304442,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 128 LSCB: 128 @@ -304553,7 +304553,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1301 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -304649,7 +304649,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 0 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -304686,7 +304686,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -304797,7 +304797,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1302 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -304893,7 +304893,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -304930,7 +304930,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -305041,7 +305041,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1303 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -305137,7 +305137,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 0 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -305174,7 +305174,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -305285,7 +305285,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1304 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -305381,7 +305381,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -305418,7 +305418,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -305529,7 +305529,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1305 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -305625,7 +305625,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 0 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -305662,7 +305662,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -305773,7 +305773,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1306 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -305869,7 +305869,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -305906,7 +305906,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT512x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT512x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -306017,7 +306017,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1307 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT512x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS64_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT512x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS64_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -306113,7 +306113,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 0 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -306150,7 +306150,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x240x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM1024_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_15_MO40_NTn1_NTA0_NTB0_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x240x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM1024_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_15_MO40_NTn1_NTA0_NTB0_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -306261,7 +306261,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1308 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x240x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM1024_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_15_MO40_NTn1_NTA0_NTB0_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCCn1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x240x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM1024_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_15_MO40_NTn1_NTA0_NTB0_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCCn1_WGMXCCGn1 SourceSwap: true SpaceFillingAlgo: [] StaggerU: 8 @@ -306357,7 +306357,7 @@ CUCount: null CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: '4' + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -306394,7 +306394,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x208x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM1024_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x208x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM1024_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -306505,7 +306505,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1309 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x208x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM1024_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x208x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM1024_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: true SpaceFillingAlgo: [] StaggerU: 32 @@ -306584,6 +306584,250 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1iMO6yjTG4yuZPDkcMXrcObwuUOXV_ipLalV5INSQqYE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123776 + LdsInitCVgprs: false + LdsNumBytes: 123776 + LdsNumElementsAlignedA: 24960 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24960 + LdsOffsetB_Blk: 90496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24960 + LdsOffsetMetadata_Blk: 90496 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumTotalPackedLoadsA: 6 + NumTotalPackedLoadsB: 8 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1310 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false - [2, 3, 0, 1] - - - [16, 368640, 1, 224] - [0, 0.0] @@ -309227,6 +309471,8 @@ - [1308, 0.0] - - [4096, 3328, 1, 8192] - [1309, 427.47] + - - [3072, 4096, 1, 8192] + - [1310, 0.0] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index bf3e5d5c9c0..9e93647b1bb 100755 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -348,15 +348,17 @@ def _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS): elif isTN(kernel) and not useLDSTr and TLDS == 1: #index and code pair syncTable = [-1, SWaitCnt(dscnt=7, vlcnt=-1, vscnt=-1, comment="for LRB1-0"), - 6, SWaitCnt(dscnt=6+5, vlcnt=-1, vscnt=-1, comment="for LRB1-1"), + 5, SWaitCnt(dscnt=6+4, vlcnt=-1, vscnt=-1, comment="for LRB1-1"), 8, SBarrier(comment="for GRA start"), 11, SWaitCnt(dscnt=5+8, vlcnt=-1, vscnt=-1, comment="for LRB1-2"), 17, SWaitCnt(dscnt=4+11, vlcnt=-1, vscnt=-1, comment="for LRB1-3"), 23, SWaitCnt(dscnt=15, vlcnt=-1, vscnt=-1, comment="for LRB1-4:6"), 41, SWaitCnt(dscnt=14, vlcnt=-1, vscnt=-1, comment="for LRB1-7"), 46, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="for LRA1"), + 47, SWaitCnt(dscnt=7, vlcnt=-1, vscnt=-1, comment="for LRB0-0"), 48, SBarrier(comment="for LRA1 start"), - 78, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="for LRB1"), + 51, SWaitCnt(dscnt=1, vlcnt=-1, vscnt=-1, comment="for LRB1"), + 78, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="for LRB0"), 78, SBarrier(comment="for LRB1 start"),] optSchedule = { 'SYNC' : [syncTable[::2]], diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml index cc00205011d..a0130ceaccd 100755 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml @@ -579,7 +579,43 @@ BenchmarkProblems: - Range: [[8192], [6656], [1], [64, 64, 256]] - Exact: [4096, 3328, 1, 8192] - BiasTypeArgs: ['b'] - + - # BenchmarkProblemSizeGroup - Standard - All problem + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16,32, 1, 1, 6, 8, 2,2 ] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - DepthU: [64] + - ScheduleIterAlg: [3] + - ExpandPointerSwap: [0] + - TransposeLDS: [1] #0,1 + - LocalReadVectorWidth: [8] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - DirectToLds: [1] + - StreamK: [3] + - LdsPadA: [8] #[-1] + - LdsPadB: [8] #[-1] + - StaggerU: [0] + - WorkGroupMapping: [16] + - WorkGroupMappingXCC: [2] + - 1LDSBuffer: [0] + - NonTemporalD: [4] + - SourceSwap: [1] + - UseSgprForGRO: [0] + - UseCustomMainLoopSchedule: [0, 1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [[192], [256], [1], [64, 64, 256]] + - Range: [[192], [256], [1], [1, 1, 64]] + - Range: [[192], [256], [1], [32, 64, 256]] + - Range: [[6144], [8192], [1], [64, 64, 256]] + - Exact: [3072, 4096, 1, 8192] + - BiasTypeArgs: ['b'] ######################################## # HHS TN - standard ######################################## From 4d3c807258f4e3560b75cf79d53eb0b951da2433 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Fri, 21 Nov 2025 02:08:07 +0000 Subject: [PATCH 09/18] remove typo --- ...k_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml | 173754 +-------------- 1 file changed, 1 insertion(+), 173753 deletions(-) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml index 0dbc76ce452..e6c4c3ae58d 100755 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml @@ -135970,173756 +135970,4 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x10M96ahVYtg9nz7wVE0_jilGKPCC2Hn7G5j6hk7n8Cq4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC2_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29440 - LdsInitCVgprs: false - LdsNumBytes: 29440 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 2 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 585 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC2_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x1aItiZpnpRjcYIurxfBmFGJqO0Ddnngzyn750XF40uwQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB3_NTC0_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29824 - LdsInitCVgprs: false - LdsNumBytes: 29824 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 20608 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 20608 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 586 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB3_NTC0_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16xI2u-Ror5FqkQ3j_XB1vG0HfqV0MUB2v1g9i5PeE9dK0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB4_NTC0_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 62336 - LdsInitCVgprs: false - LdsNumBytes: 62336 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 12672 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 587 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB4_NTC0_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x64_MI16x1tIELiWthd6KUjMdtTQSxbaPibBS-rnYxP9egrRr9Qkg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB0_NTC2_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12480 - LdsInitCVgprs: false - LdsNumBytes: 12480 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 0 - NonTemporalC: 2 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 588 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB0_NTC2_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x64_MI16x1xcpM29sdXgq9Obmz4PsmmNsNPdSFmNPCzaXJ7HLy9wE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 25216 - LdsInitCVgprs: false - LdsNumBytes: 25216 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 4608 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 20608 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 20608 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 589 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x1bAwHyQ3GvY3vKl9IMh8l8so9YqF-GnfsyYCXe2IpfsU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB3_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29824 - LdsInitCVgprs: false - LdsNumBytes: 29824 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 20608 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 20608 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 590 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB3_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x64_MI16x1EUpWGBFC9M8r9mtOXEn2kt2875FYVpKPvAYu5RNAIJM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB2_NTC3_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29440 - LdsInitCVgprs: false - LdsNumBytes: 29440 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 4608 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 3 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 591 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB2_NTC3_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x64_MI16x1JtNQNtsnA1Sv8b8yh4PSwgVqK7nqobXjvnB5dDn1KZ0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC5_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 50688 - LdsInitCVgprs: false - LdsNumBytes: 50688 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 5 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 592 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC5_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT80x64x64_MI16x1jvoQqF8xs6wPfM77e_0lXt27oInqZdCHYYvS2jjTZjs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT5_1_MO40_NTn1_NTA1_NTB2_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 53504 - LdsInitCVgprs: false - LdsNumBytes: 53504 - LdsNumElementsAlignedA: 11520 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 11520 - LdsOffsetB_Blk: 44288 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 11520 - LdsOffsetMetadata_Blk: 44288 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [5, 1] - MIWaveTileA: 5 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 80 - MacroTile1: 64 - MacroTileA: 80 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 20 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 10 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 593 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT5_1_MO40_NTn1_NTA1_NTB2_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 1 - ThreadTileA: 20 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x64_MI16x12bQCSUgEYzHwc-PD0joW94qw2tOVZsB4YM94lzol2GQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 54656 - LdsInitCVgprs: false - LdsNumBytes: 54656 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 12672 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 594 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16xH274nfviM8iTEC9EqVMp7KNIKr5Tu4fbGuMgZChQLwI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 - LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 595 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 7 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x64_MI16xeitizeynoDBPdmSwRBF5Xg6yeYRaHhmb2AuHnk-Jjxw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB1_NTC4_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59904 - LdsInitCVgprs: false - LdsNumBytes: 59904 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 596 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB1_NTC4_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM32_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT160x128x64_MI16xIyCic_6SCaQHmRo4gZpospLo38ih23M3n0i8gSNlkM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB5_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 103552 - LdsInitCVgprs: false - LdsNumBytes: 103552 - LdsNumElementsAlignedA: 21120 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 21120 - LdsOffsetB_Blk: 86656 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 21120 - LdsOffsetMetadata_Blk: 86656 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [5, 4] - MIWaveTileA: 5 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 128 - MacroTileA: 160 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 5 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 597 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB5_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 4 - ThreadTileA: 20 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xf6FAbndEq795AH2jLpLXyL_zsKaM-nx7XEfVRnU4xGg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB2_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 - LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 2 - NonTemporalC: 3 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 598 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB2_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x32x512_MI16x9RYSCMS1YOFrGPF7jBc1TMhLBlf1ljMGKxQVtEqS2Lg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 512 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB2_NTC1_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false - LSCA: 512 - LSCB: 512 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 2048 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 115712 - LdsInitCVgprs: false - LdsNumBytes: 115712 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 4 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 599 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB2_NTC1_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x256_MI16xSF36trc75_ee7cIk05feZIYCwFPm6GAFiS2-I-5FAbA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 3 - NonTemporalC: 2 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 600 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x128_MI16xtDurztqqli3v-jsi4_HIWyYY_P3H0AZYLDd26dnVkks= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA7_NTB1_NTC1_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 110592 - LdsInitCVgprs: false - LdsNumBytes: 110592 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 601 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA7_NTB1_NTC1_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT48x64x256_MI16xkAW-i3iRPLi_hWhZCB7bemxTlrcNdInt5o4XHUQLbtM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB0_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126464 - LdsInitCVgprs: false - LdsNumBytes: 126464 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] - MIWaveTileA: 3 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 64 - MacroTileA: 48 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 602 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB0_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 1 - ThreadTileA: 12 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x80x256_MI16xs81qgGS4sNKWxhufdlT4wTK1vEJhN5Ffxrcx9SpGsKM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x80x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_5_MO40_NTn1_NTA4_NTB1_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 156672 - LdsInitCVgprs: false - LdsNumBytes: 156672 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 43520 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 78336 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 113152 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 113152 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 5] - MIWaveTileA: 1 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 80 - MacroTileA: 64 - MacroTileB: 80 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 20 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 603 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x80x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_5_MO40_NTn1_NTA4_NTB1_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 5 - ThreadTileA: 4 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x96x128_MI16xrLeoctTCB1bACoQ--beNqEzb0YI8LOUhpdHAYkRgIvY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA7_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 55296 - LdsInitCVgprs: false - LdsNumBytes: 55296 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 55296 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 3] - MIWaveTileA: 3 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 96 - MacroTileA: 96 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 36 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 6 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 604 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA7_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 3 - ThreadTileA: 12 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x64x128_MI16xfNGTqyHzTAgsGJT4k9BPcwtHxtde0XJty9p1gmf49y8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA4_NTB2_NTC2_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 110080 - LdsInitCVgprs: false - LdsNumBytes: 110080 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 1] - MIWaveTileA: 6 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 2 - NonTemporalC: 2 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 6 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 605 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA4_NTB2_NTC2_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 1 - ThreadTileA: 24 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT48x64x256_MI16xTlqyDmTkj4a9CbMyFh6-oJ7vLAZn4NKSD09VWqfaX6s= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB2_NTC0_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126464 - LdsInitCVgprs: false - LdsNumBytes: 126464 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] - MIWaveTileA: 3 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 64 - MacroTileA: 48 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 606 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB2_NTC0_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 1 - ThreadTileA: 12 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x256_MI16xLmmGlj_Aj8A6fIMmFhO1dylq0PzddWmPhFmOjV8mNYs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA3_NTB3_NTC7_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 136192 - LdsInitCVgprs: false - LdsNumBytes: 136192 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 68096 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 607 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA3_NTB3_NTC7_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x96x128_MI16xTLhvHuGGTrNNoRNX0qiCIT3prWYKTLd7nnYcbxeKw70= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA5_NTB0_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120832 - LdsInitCVgprs: false - LdsNumBytes: 120832 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 3] - MIWaveTileA: 3 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 96 - MacroTileA: 96 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 36 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 6 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 608 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA5_NTB0_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 3 - ThreadTileA: 12 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x128x128_MI16v7CFS67dToWu7E36jBzM3hnxFhhWEF1cSNV18qHrtEk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_2_MO40_NTn1_NTA1_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126464 - LdsInitCVgprs: false - LdsNumBytes: 126464 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 2] - MIWaveTileA: 6 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 609 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_2_MO40_NTn1_NTA1_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 2 - ThreadTileA: 24 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x128x128_MI16ZWO7HpjPNmu6UA2V1hsB04cMUqxmkdqiKg8wJvcIyHk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA7_NTB2_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126464 - LdsInitCVgprs: false - LdsNumBytes: 126464 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 2] - MIWaveTileA: 6 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 610 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA7_NTB2_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 2 - ThreadTileA: 24 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI19h3rQP9aLCc3mGFwqilyX_IB29uLEdD2KWe4y57nhKw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB1_NTC3_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 1 - NonTemporalC: 3 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 611 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB1_NTC3_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI1CxJiV14JTrK3H1ybea3EPczFFJRckm5EFanYUzn3ORk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB1_NTC7_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 612 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB1_NTC7_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x96x128_MI16xHJIMpCJEpG_CXiSzPpjlBMrZYiH1WeXOfw9riY9KTWc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA3_NTB2_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120832 - LdsInitCVgprs: false - LdsNumBytes: 120832 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 3] - MIWaveTileA: 3 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 96 - MacroTileA: 96 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 36 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 6 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 613 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA3_NTB2_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 3 - ThreadTileA: 12 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x80x128_MI16ngeiZGZG7mL9cfPgvCQlNFzzUu9wWiYYYs_RgDPtw9s= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 - LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 80 - MacroTileA: 128 - MacroTileB: 80 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 614 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 5 - ThreadTileA: 8 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x80x128_MI1673d8kHswsq18xA7tmJKkurm3XT4Q-HnKYoAezcsdn54= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA2_NTB1_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 - LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 80 - MacroTileA: 128 - MacroTileB: 80 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 615 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA2_NTB1_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 5 - ThreadTileA: 8 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI32csEMSPPsusU5YMb_wufq8F6P-8Sxs2EgAC0PPFEv6gk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA2_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126464 - LdsInitCVgprs: false - LdsNumBytes: 126464 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 26112 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 616 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA2_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM16_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x128x128_MI1689RN_I3TvC1Gm0Z2xealtErSeUtX5KtCYijpRpZlPVk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126464 - LdsInitCVgprs: false - LdsNumBytes: 126464 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 2] - MIWaveTileA: 6 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 617 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 2 - ThreadTileA: 24 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x128x128_MI16AzS5NoxFk2Au_HbPsvaytavfJD8hghd_sJGfMrTtqqc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA3_NTB2_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126464 - LdsInitCVgprs: false - LdsNumBytes: 126464 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 2] - MIWaveTileA: 6 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 618 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA3_NTB2_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 2 - ThreadTileA: 24 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x64_MI16AKi47eQ72s767fgFKjHxBMB61xROY48hH_ZC9Dg-uL4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA2_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 56832 - LdsInitCVgprs: false - LdsNumBytes: 56832 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 30720 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 56832 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 3] - MIWaveTileA: 12 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 192 - MacroTileA: 192 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 6 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 619 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA2_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 3 - ThreadTileA: 48 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x224x64_MI16vPxjaC7jAP1WCZxG5tdxVnhuIV2nrtGe7eEx5M_A97k= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA2_NTB2_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 - LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 35840 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 7] - MIWaveTileA: 6 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 224 - MacroTileA: 192 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 168 - NumGlobalWriteVectorsPerThread: 84 - NumLoadsA: 6 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 620 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA2_NTB2_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 7 - ThreadTileA: 24 - ThreadTileB: 7 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16UfrTfwRtCk7X5b0oDzl9lO69mf5F9FKhG38KQAizz0I= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC6_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 125440 - LdsInitCVgprs: false - LdsNumBytes: 125440 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 4] - MIWaveTileA: 12 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 24 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 24 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 621 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC6_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM2_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 4 - ThreadTileA: 48 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x48x64_MI16x1zwucDcbYqoLc5jj3KVOwMSgVTbe-k-vlwhLtkP43A7w= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA4_NTB1_NTC0_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 31168 - LdsInitCVgprs: false - LdsNumBytes: 31168 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 6400 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 48 - MacroTileA: 64 - MacroTileB: 48 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 6 - NumLoadsA: 4 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 622 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA4_NTB1_NTC0_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x64_MI16xhFp48AWAx0WFxNJS_zwhp3LH75FnBeuIGQ-PHfRGV8M= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA7_NTB3_NTC1_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59904 - LdsInitCVgprs: false - LdsNumBytes: 59904 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 623 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA7_NTB3_NTC1_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x192x64_MI16xDc85R6FBHvP3Db6U89LaQsZiTgNI0YXqfD8EZrc9sRY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99584 - LdsInitCVgprs: false - LdsNumBytes: 99584 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 74240 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 192 - MacroTileA: 64 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 3 - NonTemporalC: 2 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 624 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16vPoTMjKt8nWTFpbxSId3wmEKYDEB48u2cWczx9X8ORY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA6_NTB3_NTC2_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 3 - NonTemporalC: 2 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 625 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA6_NTB3_NTC2_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16BOWImfSEsD_3G-o1iXGR5QlXGk-JQslWP21I5MyceTU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA7_NTB0_NTC4_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 110592 - LdsInitCVgprs: false - LdsNumBytes: 110592 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 6] - MIWaveTileA: 4 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 192 - MacroTileA: 128 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 626 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA7_NTB0_NTC4_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 6 - ThreadTileA: 16 - ThreadTileB: 6 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x160x64_MI16xWi346OG8MPM52jqLnWZLZZZ3iPSXNJNwpzIVgL5wxlU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA4_NTB2_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65024 - LdsInitCVgprs: false - LdsNumBytes: 65024 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 160 - MacroTileA: 64 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 20 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 20 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 627 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA4_NTB2_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 5 - ThreadTileA: 8 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16PnbeKpDjqDFQuRGaKM1Bxep0ty_w-huZ0GCiXdgUOH0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA2_NTB3_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 2] - MIWaveTileA: 8 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 628 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA2_NTB3_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16dZECwM1-ZUT8NrMGVw0isp2_dkCorYlpeB97j2fIEDQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA2_NTB3_NTC6_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 2] - MIWaveTileA: 8 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 629 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA2_NTB3_NTC6_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16jYuVIUs7OMwAdUxYUsQi4ZYOBPt9XeBE6EvrTgfVOXM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA4_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 104064 - LdsInitCVgprs: false - LdsNumBytes: 104064 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 16 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 630 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA4_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT160x128x64_MI160ChZLQK1-RbLvH-HVpy6jN_NmDI3jJlXjB9H7W24k_M= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_2_MO40_NTn1_NTA2_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 105472 - LdsInitCVgprs: false - LdsNumBytes: 105472 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 88576 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 88576 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [10, 2] - MIWaveTileA: 10 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 128 - MacroTileA: 160 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 20 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 20 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 631 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_2_MO40_NTn1_NTA2_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 2 - ThreadTileA: 40 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16nbKJcGCHBDcxSonpixiDJ1Xzs9zasEp0fEz4y57YzZM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB3_NTC4_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 4 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 632 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB3_NTC4_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16Qb0kRw4kC1BW5Axl2fZWWgBa72Xc93g5inENYkl8kQ4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 633 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT112x16x64_MI16xSPOJU0ZSldRFeT5QIftb2Rv0LMrYsuCg4tJSJO98EbA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT112x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA4_NTB3_NTC1_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49728 - LdsInitCVgprs: false - LdsNumBytes: 49728 - LdsNumElementsAlignedA: 14848 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 14848 - LdsOffsetB_Blk: 47616 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 14848 - LdsOffsetMetadata_Blk: 47616 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [7, 1] - MIWaveTileA: 7 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 112 - MacroTile1: 16 - MacroTileA: 112 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 28 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 14 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 14 - NumLoadsPerpendicularB: 2 - NumThreads: 64 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 634 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT112x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA4_NTB3_NTC1_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 1 - ThreadTileA: 28 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT112x32x64_MI16xhCkqxVO4S_u07UIACIbdXayZs6cLW0nwjm56at5G-TY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT112x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA3_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51840 - LdsInitCVgprs: false - LdsNumBytes: 51840 - LdsNumElementsAlignedA: 14848 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 14848 - LdsOffsetB_Blk: 47616 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 14848 - LdsOffsetMetadata_Blk: 47616 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [7, 1] - MIWaveTileA: 7 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 112 - MacroTile1: 32 - MacroTileA: 112 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 28 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 7 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 7 - NumLoadsPerpendicularB: 2 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 635 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT112x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA3_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 1 - ThreadTileA: 28 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x48x64_MI16x9yvEFw2CY7fogdvm7chr1dqncsv-JhOXUYRyF96wk04= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA2_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58112 - LdsInitCVgprs: false - LdsNumBytes: 58112 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 6912 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18432 - LdsOffsetMetadata_Blk: 51200 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 48 - MacroTileA: 128 - MacroTileB: 48 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 16 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 636 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA2_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16xlATdcDJmHjTh2fb3_R4sw8cZpm4iDPSIziZbeZj6_2E= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58880 - LdsInitCVgprs: false - LdsNumBytes: 58880 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 637 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x80x64_MI16xuf_mQQYY6e_xseewo_eCdO8-75Ww2keRq6k-6Kte8TQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_5_MO40_NTn1_NTA7_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61184 - LdsInitCVgprs: false - LdsNumBytes: 61184 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 11520 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 80 - MacroTileA: 128 - MacroTileB: 80 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 4 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 638 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_5_MO40_NTn1_NTA7_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 5 - ThreadTileA: 8 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT448x16x64_MI16xphmWFUPXbOcidjCYWA4-uWMzUY_LXmzcCzaU91ZAfzg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x16x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA4_NTB2_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 74240 - LdsInitCVgprs: false - LdsNumBytes: 74240 - LdsNumElementsAlignedA: 71680 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 71680 - LdsOffsetB_Blk: 202752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 74240 - LdsOffsetMetadata_Blk: 202752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [7, 1] - MIWaveTileA: 7 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 448 - MacroTile1: 16 - MacroTileA: 448 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 28 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 14 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 14 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 639 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x16x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA4_NTB2_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 1 - ThreadTileA: 28 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT448x32x64_MI16xyGJzXCsePrx3w8oPN_5-Z4ms4U5FQdG0p8cPMrUh6p0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_1_MO40_NTn1_NTA5_NTB1_NTC3_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 - LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 64512 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 64512 - LdsOffsetB_Blk: 134144 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 64512 - LdsOffsetMetadata_Blk: 134144 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [14, 1] - MIWaveTileA: 14 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 448 - MacroTile1: 32 - MacroTileA: 448 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 1 - NonTemporalC: 3 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 56 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 14 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 14 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 640 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_1_MO40_NTn1_NTA5_NTB1_NTC3_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 56 - ThreadTile1: 1 - ThreadTileA: 56 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT224x64x64_MI16xAem7wjfwMpeIdy5IePcpzvkIPguZG_-48eS9Utsv2B0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT14_1_MO40_NTn1_NTA6_NTB0_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 42496 - LdsInitCVgprs: false - LdsNumBytes: 42496 - LdsNumElementsAlignedA: 32256 - LdsNumElementsAlignedB: 10240 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32256 - LdsOffsetB_Blk: 97792 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 42496 - LdsOffsetMetadata_Blk: 97792 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [14, 1] - MIWaveTileA: 14 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 224 - MacroTile1: 64 - MacroTileA: 224 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 56 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 7 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 7 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 641 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT14_1_MO40_NTn1_NTA6_NTB0_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 56 - ThreadTile1: 1 - ThreadTileA: 56 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT448x96x64_MI16xwVNXPPje9A9Oux2i66AN2WtWvGx_LcqCLKCNdupasEs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_3_MO40_NTn1_NTA7_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 159744 - LdsInitCVgprs: false - LdsNumBytes: 159744 - LdsNumElementsAlignedA: 64512 - LdsNumElementsAlignedB: 15360 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 79872 - LdsOffsetB: 64512 - LdsOffsetB_Blk: 144384 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 64512 - LdsOffsetMetadata_Blk: 144384 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [14, 3] - MIWaveTileA: 14 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 448 - MacroTile1: 96 - MacroTileA: 448 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 168 - NumGlobalWriteVectorsPerThread: 84 - NumLoadsA: 14 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 14 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 642 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_3_MO40_NTn1_NTA7_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 56 - ThreadTile1: 3 - ThreadTileA: 56 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x64_MI16x1Zxj7hmIhFf0e6sHLkyALBfq6r73zPsZ-XR3BoqHDSgk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB1_NTC1_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12480 - LdsInitCVgprs: false - LdsNumBytes: 12480 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 643 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB1_NTC1_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x64_MI16x1XQlhZiKbwBTnsXPxp1q4o2irOq2zfp2Y65C9IZ3YoMQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12480 - LdsInitCVgprs: false - LdsNumBytes: 12480 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 644 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x32x64_MI16x1RDT_SAbG1Q4m8tjAod_ZUY3haGM8M2J7D6id1FgbeLE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 14592 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 645 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 7 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x48x64_MI16x1MdzKbimZAKsiTS822naX-Cj47npLSbYpMQ1E6wBqIR4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x48x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA3_NTB3_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24896 - LdsInitCVgprs: false - LdsNumBytes: 24896 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 6400 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 18560 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 18560 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 48 - MacroTileA: 16 - MacroTileB: 48 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 2 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 6 - NumThreads: 64 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 646 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x48x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA3_NTB3_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 3 - ThreadTileA: 4 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x64x64_MI16x1ET1ujsY8nygIYm_fI--IZvnOtQX-vgf9FL_uuSis_dM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 27008 - LdsInitCVgprs: false - LdsNumBytes: 27008 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 18560 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 18560 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 647 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 7 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x64_MI16x1ZK4lPjLxPtESxVESaBb0bjZvzm3PXcRqVZ9QNTtsFmU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB5_NTC1_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 25216 - LdsInitCVgprs: false - LdsNumBytes: 25216 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 - NonTemporalC: 1 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 648 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB5_NTC1_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x1ryhvjQy9Rt7q0NXyJWBm18l0Me0RTEddt73F0RdSnWg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29440 - LdsInitCVgprs: false - LdsNumBytes: 29440 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 649 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16xWDZ6BX4xpSToEnwcbyhgbIcvdSHipUU7aRUJWaWZoAQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB7_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 62336 - LdsInitCVgprs: false - LdsNumBytes: 62336 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 12672 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 650 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB7_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x32x64_MI16x1Z0vg-bZ_gL9MhYmVUEa6Ri_Qp759nhBFrg1Cxoodnjg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB2_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 14592 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 651 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB2_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x1rPu3O7LZ76vL2u7y5rr3EWOiCmLkbwq5NK6qXHp6gKc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB2_NTC2_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29824 - LdsInitCVgprs: false - LdsNumBytes: 29824 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 20608 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 20608 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 2 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 652 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB2_NTC2_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x1tDT_hrw3fHD8_o3Zp1b1tGE2weQyHC32kJu7wlmKPxc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA2_NTB0_NTC2_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29440 - LdsInitCVgprs: false - LdsNumBytes: 29440 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 2 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 653 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA2_NTB0_NTC2_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x96x64_MI16x1-HBv_7E9Iz2YhgL1w1MXVfJP3ZE20choBF-i3eydDf8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA0_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 - LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 13824 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 37376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 37376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 96 - MacroTileA: 32 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 12 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 12 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 654 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA0_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 3 - ThreadTileA: 4 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x64_MI16x1B4FrTyt9kfGdKb4c_hmAfgg7GdQSww8Qzv_iE0c3gi0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA1_NTB3_NTC3_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 - LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 3 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 655 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA1_NTB3_NTC3_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x64_MI16x1JEVCxGr47CIY-HI8qxrZmKgrc1wM-_JZuDpX1WhY7IM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA1_NTB2_NTC4_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 50688 - LdsInitCVgprs: false - LdsNumBytes: 50688 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 656 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA1_NTB2_NTC4_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x64_MI16x1P7fU9OFzqfhEM9o3ZnBDL2oiSqY7E--X2JskGaL9FoA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24832 - LdsInitCVgprs: false - LdsNumBytes: 24832 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 20608 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 20608 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 3 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 657 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x64_MI16x1BLbdM9dbMt3NUAhJq-FfugXosmhxsqmK_BkcQ5OAhOg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 54656 - LdsInitCVgprs: false - LdsNumBytes: 54656 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 12672 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 658 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16xbKVSGGFgDw5i8McKOQa-A53vhnlFKW7c72QLmlhQisQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB3_NTC4_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58880 - LdsInitCVgprs: false - LdsNumBytes: 58880 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 4 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 659 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB3_NTC4_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16xOJCUhlFECuCTwzXpkC6jiu2Ixkoz_9HWuHIxaAxXqh8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58880 - LdsInitCVgprs: false - LdsNumBytes: 58880 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 660 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16x8Lu9f2yKHuTv8Iy85nfNk3O3KnHLpzbK3KxUw1Mbok4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 62848 - LdsInitCVgprs: false - LdsNumBytes: 62848 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 12672 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 16 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 661 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x512_MI16xXRxh_faCRRltjL034xoMmk_TiKa3xdJCa74hUHvvHEU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 512 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false - LSCA: 512 - LSCB: 512 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 4 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 0 - NonTemporalC: 2 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 662 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x48x512_MI16xRWoVyAvU8jwb3j6CRTvndH2SL3yYuGMQRd2NzD81sLI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 512 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x48x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA6_NTB1_NTC0_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false - LSCA: 512 - LSCB: 512 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 50688 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 84480 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 84480 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 4 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 48 - MacroTileA: 16 - MacroTileB: 48 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 3 - NumGlobalWriteVectorsPerThread: 3 - NumLoadsA: 4 - NumLoadsB: 12 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 12 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 663 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x48x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA6_NTB1_NTC0_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 3 - ThreadTileA: 4 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x16x256_MI16xCiygDODDK4Fa2btyaNoWwcgWWpgaEN3yAsezRpzjY5c= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 107776 - LdsInitCVgprs: false - LdsNumBytes: 107776 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 664 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x256_MI16xiervKaX2K61CRRejq1AG3_Z65vVaHOJ0L6-2dHWApwY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC0_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 665 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC0_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x256_MI16x7_taCQeo5Fw91ch_k5selL-SvvOT-Li1hQ45Oq47or0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB1_NTC3_NTD7_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 1 - NonTemporalC: 3 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 666 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB1_NTC3_NTD7_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x16x128_MI16xspEMKZ93Myckc_GOKIntGOAz2o43UgKBk4YuGWRg8fk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC2_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 55808 - LdsInitCVgprs: false - LdsNumBytes: 55808 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 4608 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18432 - LdsOffsetMetadata_Blk: 51200 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 2 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 667 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC2_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x128_MI16xi-yvAevSwW9s0hapSLs7FgKWw6ot3l97yFKd4adNR2U= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB0_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 668 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB0_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x128_MI16xOfKmRAiyYbD1ongki4w-aOH6B_Asv5QGII7Zid4nDtU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB1_NTC2_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 2 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 669 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB1_NTC2_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x128_MI16xIj575u2gUPbRKt9h6HW8Brd0cnlHlMiSmmqDVlgfL7I= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB3_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 670 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB3_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x128_MI16xZ49CBLf5ScwOBmtxfyDIZSZP9YOFJqvvP643E4q-8r8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA3_NTB1_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 - LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 671 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA3_NTB1_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI169Pt6LbH02NexWojOA7qAB2-Fm2qPQ2e49zKWSkq3aQ4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB1_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 - LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 672 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB1_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI16cwc8HxsQarpjYLPbWETQe3uyARxl1l74SNVlAyjgLDQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA1_NTB1_NTC5_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 - LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 673 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA1_NTB1_NTC5_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x128_MI16VUU44mm8BMcnxwEtHA89E_6df0UR4aTy2W8vQlkVcbA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 674 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x128_MI16ksL5sGakGBodVE2S4KzleS-f8oU1KqielXgO3mZSIQw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117248 - LdsInitCVgprs: false - LdsNumBytes: 117248 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 675 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x64_MI16x1MjIFVZajzfoFkPKUGd62NeM2UNclTA5fyapd57LKLMA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA1_NTB2_NTC5_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 - LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 676 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA1_NTB2_NTC5_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM32_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x160x128_MI16NeMdRGaNIunm3_27NSGWaZZR4mRdMr8hAhKtRPCxJTM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x160x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 129024 - LdsInitCVgprs: false - LdsNumBytes: 129024 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 46080 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 160 - MacroTileA: 64 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 4 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 677 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x160x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 5 - ThreadTileA: 8 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x192x128_MI16nXFspfQ1n-JzXHNxfWztMu23CLya1Pfo9pLNlbt2Q7Q= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA3_NTB2_NTC4_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 144384 - LdsInitCVgprs: false - LdsNumBytes: 144384 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 55296 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 72192 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 89088 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 89088 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 192 - MacroTileA: 64 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 12 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 12 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 678 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA3_NTB2_NTC4_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x80x128_MI16tAd_lkigyTdEYRjzMIK5_8l6RnyYhnBdukQFBQ-gD0M= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA1_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 - LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 80 - MacroTileA: 128 - MacroTileB: 80 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 679 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA1_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 5 - ThreadTileA: 8 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x64_MI16x1Dfy4Xf_-nNJ9qTF6JJW4iktgBz4PP6t0_lQo70vu8Pw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA2_NTB3_NTC0_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 - LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 15360 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 680 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA2_NTB3_NTC0_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x128x128_MI16TAya8fP-9detORb1PiFXCjXxw-JyWSf1Yy8FSpejObs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_2_MO40_NTn1_NTA3_NTB3_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126464 - LdsInitCVgprs: false - LdsNumBytes: 126464 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 2] - MIWaveTileA: 6 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 681 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_2_MO40_NTn1_NTA3_NTB3_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 2 - ThreadTileA: 24 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x96x64_MI16xpq67mkJqOOTRU5Z-xeSs2N2SBCRuU-O8RBVF0fAkxgo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA3_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 112000 - LdsInitCVgprs: false - LdsNumBytes: 112000 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 12672 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 3] - MIWaveTileA: 8 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 96 - MacroTileA: 256 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 682 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA3_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 3 - ThreadTileA: 32 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x32_MI16F5-Y3heVCa_launRUn-KSz1wHCDBn8ZAWYAzxkNo_44= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 - LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 15360 - LdsNumElementsAlignedB: 15360 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 15360 - LdsOffsetB_Blk: 48128 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 15360 - LdsOffsetMetadata_Blk: 48128 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 6] - MIWaveTileA: 6 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 192 - MacroTileA: 192 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 72 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 683 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS64_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 64 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 6 - ThreadTileA: 24 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x128x64_MI16gWrmBtg2_i8eryiIALzrySUiqdyXxbV8TJwUnamyhPQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_2_MO40_NTn1_NTA2_NTB2_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 44544 - LdsInitCVgprs: false - LdsNumBytes: 44544 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 44544 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 2] - MIWaveTileA: 12 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 7 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 6 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 684 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_2_MO40_NTn1_NTA2_NTB2_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 2 - ThreadTileA: 48 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16EzTqgiXgPBF_ARPg_cK-JoXMOyRPW6J3lOFjOMJa4Ho= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA2_NTB2_NTC1_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 125440 - LdsInitCVgprs: false - LdsNumBytes: 125440 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 4] - MIWaveTileA: 12 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 24 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 24 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 685 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA2_NTB2_NTC1_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 4 - ThreadTileA: 48 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x64_MI16x18L2jmuT-hMIzEZWLTh1LIKclLsLpudAVc-YTJZ2phKQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 50688 - LdsInitCVgprs: false - LdsNumBytes: 50688 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 2 - NonTemporalC: 2 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 686 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x64_MI16xDCe3l8na3f27VLtnVlzFGVavj96L_Azsll4KulPHqsI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA6_NTB0_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59904 - LdsInitCVgprs: false - LdsNumBytes: 59904 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 687 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA6_NTB0_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16xVIw57mRIRYH0m_CcIZDK-p-YdVEXWxLXeDaRPF09Dbw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA6_NTB1_NTC0_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 62336 - LdsInitCVgprs: false - LdsNumBytes: 62336 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 12672 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 688 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA6_NTB1_NTC0_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16z31fl2N-0jv2niJNtBtZg1a7wQy6gEnUqHfWWGXiT3E= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA6_NTB2_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 689 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA6_NTB2_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16c4B1pd6abobRt0oLrz-drDK66b4TY25EF2_dOH_x7U4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA7_NTB0_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 110592 - LdsInitCVgprs: false - LdsNumBytes: 110592 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 6] - MIWaveTileA: 4 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 192 - MacroTileA: 128 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 690 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA7_NTB0_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 6 - ThreadTileA: 16 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x192x64_MI16xwNdA3XAVZgNp7B97A8M72_NWG1tfTcIrxSB0QVt12A0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA4_NTB3_NTC0_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99584 - LdsInitCVgprs: false - LdsNumBytes: 99584 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 74240 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 192 - MacroTileA: 64 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 691 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA4_NTB3_NTC0_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16tGuzVf8otnHrXADhFHX3DGYqRygBCmdCoE5Fl-ZtZ1Q= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA6_NTB3_NTC1_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99840 - LdsInitCVgprs: false - LdsNumBytes: 99840 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 692 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA6_NTB3_NTC1_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16cvElwXhZzoI5bHsRvbpqVI_TQ8sdHbKbNSqGN-UZTOg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 2] - MIWaveTileA: 8 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 693 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16eECx5F9F_BhjbKQ29yTAgPVBEIVPLCxsVGWNpAn05IU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA4_NTB0_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 104064 - LdsInitCVgprs: false - LdsNumBytes: 104064 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 16 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 694 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA4_NTB0_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16QXtD7oXSMnmIF8hDGFn7uvC5RdO-XbtSlTg2Daq-GCE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA6_NTB2_NTC6_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 107776 - LdsInitCVgprs: false - LdsNumBytes: 107776 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 3] - MIWaveTileA: 8 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 192 - MacroTileA: 128 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 695 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA6_NTB2_NTC6_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 3 - ThreadTileA: 32 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16kGGVlLQYO67WzbEjuBWcMuovcPe3dUezUjRu8kulS10= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB1_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 3] - MIWaveTileA: 16 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 696 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB1_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 3 - ThreadTileA: 64 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x16x64_MI16xzAdJSKxtc6G2i4oH_Hj-ERRE93AFOiKixcHMN4uLcXQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA6_NTB1_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 4 - LVCA: 8 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51968 - LdsInitCVgprs: false - LdsNumBytes: 51968 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 2304 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 697 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA6_NTB1_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x16x64_MI16x_nXPOwLHN3tUCPcweu7IEYYExrIq38iv1_FscmoR-Aw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_1_MO40_NTn1_NTA4_NTB2_NTC2_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51776 - LdsInitCVgprs: false - LdsNumBytes: 51776 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [8, 1] - MIWaveTileA: 8 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 2 - NonTemporalC: 2 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 1 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 698 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_1_MO40_NTn1_NTA4_NTB2_NTC2_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT112x32x64_MI16xKLw0NFHWmTaLgGANNLe0WN9LcfTlWk51L14snqqNok8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT112x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA1_NTB2_NTC0_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51840 - LdsInitCVgprs: false - LdsNumBytes: 51840 - LdsNumElementsAlignedA: 14848 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 14848 - LdsOffsetB_Blk: 47616 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 14848 - LdsOffsetMetadata_Blk: 47616 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [7, 1] - MIWaveTileA: 7 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 112 - MacroTile1: 32 - MacroTileA: 112 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 28 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 7 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 7 - NumLoadsPerpendicularB: 2 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 699 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT112x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA1_NTB2_NTC0_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 1 - ThreadTileA: 28 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x48x64_MI16x9wGPz7-Mii8l-IhgHa7zhP1w2g2Ym4tR2F7tq6DEi0Q= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA4_NTB2_NTC0_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 56576 - LdsInitCVgprs: false - LdsNumBytes: 56576 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 6912 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 48 - MacroTileA: 128 - MacroTileB: 48 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 6 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 700 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA4_NTB2_NTC0_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16xPabkvQBq6PTiaDnOaVG-y_ngipkRJNKgbjS24MBfNqk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA4_NTB3_NTC0_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58880 - LdsInitCVgprs: false - LdsNumBytes: 58880 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 1] - MIWaveTileA: 8 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 701 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA4_NTB3_NTC0_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT112x128x64_MI16TfLTsnYrpucGsLz8gM1xsamGYVXiPxeFu05ySX1YaOk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT112x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT7_2_MO40_NTn1_NTA4_NTB3_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98560 - LdsInitCVgprs: false - LdsNumBytes: 98560 - LdsNumElementsAlignedA: 16128 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16128 - LdsOffsetB_Blk: 81664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16128 - LdsOffsetMetadata_Blk: 81664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [7, 2] - MIWaveTileA: 7 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 112 - MacroTile1: 128 - MacroTileA: 112 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 56 - NumGlobalWriteVectorsPerThread: 56 - NumLoadsA: 14 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 14 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 702 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT112x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT7_2_MO40_NTn1_NTA4_NTB3_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 2 - ThreadTileA: 28 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT448x16x64_MI16x9oEas0UOpGUOmhyse3IXS7SIr_ndKahc3Jx4Suz2H-U= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x16x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA7_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 74240 - LdsInitCVgprs: false - LdsNumBytes: 74240 - LdsNumElementsAlignedA: 71680 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 71680 - LdsOffsetB_Blk: 202752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 74240 - LdsOffsetMetadata_Blk: 202752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [7, 1] - MIWaveTileA: 7 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 448 - MacroTile1: 16 - MacroTileA: 448 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 28 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 14 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 14 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 703 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x16x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA7_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 1 - ThreadTileA: 28 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT448x64x64_MI16xeVBJlxCeUfN_ihjeytHEHpBd2522A2rZt89DOT-bqvw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x64x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_2_MO40_NTn1_NTA7_NTB0_NTC6_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 73728 - LdsInitCVgprs: false - LdsNumBytes: 73728 - LdsNumElementsAlignedA: 64512 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 64512 - LdsOffsetB_Blk: 195584 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 73728 - LdsOffsetMetadata_Blk: 195584 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [14, 2] - MIWaveTileA: 14 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 448 - MacroTile1: 64 - MacroTileA: 448 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 112 - NumGlobalWriteVectorsPerThread: 56 - NumLoadsA: 14 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 14 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 704 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x64x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_2_MO40_NTn1_NTA7_NTB0_NTC6_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 56 - ThreadTile1: 2 - ThreadTileA: 56 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT160x64x64_MI16xO93OcQCctQNjvd4HoNyaVKrJHjm_AEXCRD9SlC29sEY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x64x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_1_MO40_NTn1_NTA5_NTB2_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33280 - LdsInitCVgprs: false - LdsNumBytes: 33280 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 10240 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 88576 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 88576 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [10, 1] - MIWaveTileA: 10 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 64 - MacroTileA: 160 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 5 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 705 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x64x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_1_MO40_NTn1_NTA5_NTB2_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 1 - ThreadTileA: 40 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT448x128x64_MI16qXhzyG-FakFAbib9NlL1w8ROnfbqFiOW7XA57Hqai5w= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_4_MO40_NTn1_NTA6_NTB0_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 163840 - LdsInitCVgprs: false - LdsNumBytes: 163840 - LdsNumElementsAlignedA: 64512 - LdsNumElementsAlignedB: 17408 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 81920 - LdsOffsetB: 64512 - LdsOffsetB_Blk: 146432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 64512 - LdsOffsetMetadata_Blk: 146432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [14, 4] - MIWaveTileA: 14 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 448 - MacroTile1: 128 - MacroTileA: 448 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 112 - NumLoadsA: 14 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 14 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 706 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_4_MO40_NTn1_NTA6_NTB0_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 56 - ThreadTile1: 4 - ThreadTileA: 56 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x64_MI16x1NCytB9k0ij3z4uf6B317BIYNTx4tr48y9CngTD1U4ZI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB3_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12480 - LdsInitCVgprs: false - LdsNumBytes: 12480 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 2 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 707 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB3_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x64_MI16x1YtPZCOLi-DnSw5IVU7dP5eBr01NxaSBDvkq89RKa8pA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC0_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12480 - LdsInitCVgprs: false - LdsNumBytes: 12480 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 708 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC0_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x64_MI16x1GId-X8cKHc-l_ELt7OdAoaqSR_hvch9JTa1nGl6jiY0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC3_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12480 - LdsInitCVgprs: false - LdsNumBytes: 12480 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 3 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 709 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC3_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x32x64_MI16x1Cd41uzHX9EztoGVefiFEAAyJdVVCF8AvQy7SMCZ8NNE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 14592 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 710 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 7 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x32x64_MI16x10DytydjFaSLdZouDuqkxSLEH6VzuTGNc9jervTdQBOU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 14592 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 711 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x64x64_MI16x1tdrQ8qpLoUomTcqwbKmBEEvcHHPcszC0bIHRtUtSkcs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_4_MO40_NTn1_NTA0_NTB3_NTC0_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 27008 - LdsInitCVgprs: false - LdsNumBytes: 27008 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 18560 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 18560 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 4] - MIWaveTileA: 1 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 - NumThreads: 64 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 712 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_4_MO40_NTn1_NTA0_NTB3_NTC0_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x1UPuQ8Zwz0zy3F2ssNB85ESh41zQ_fTaxnDqDRps9bzg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA1_NTB2_NTC1_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29440 - LdsInitCVgprs: false - LdsNumBytes: 29440 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 713 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA1_NTB2_NTC1_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x12jV8nCEXICeDeNZR3rpMOdU2WL28KVKkvtHvBJEjsoE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29824 - LdsInitCVgprs: false - LdsNumBytes: 29824 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 20608 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 20608 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 714 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x1c6s2CEIFL5M-Q5ue-1R60z2fLeGtlkMFej_fdZmZYUg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC2_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29824 - LdsInitCVgprs: false - LdsNumBytes: 29824 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 20608 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 20608 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 2 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 715 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC2_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x1R3wMxpmrTzbHyUNQAHs0qpo4qAauWtxgQvYZS-wtR2M= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29824 - LdsInitCVgprs: false - LdsNumBytes: 29824 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 20608 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 20608 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 716 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC4_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x64_MI16x1cURYFfuow7g066ypzbjOK3jjY2fuD-5KEaDvgcYCHyc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB6_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29440 - LdsInitCVgprs: false - LdsNumBytes: 29440 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 4608 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 717 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB6_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16xlcYR8juFWCw_Ix8760EPVoEwO3Roqo9_rr2p9Xkv5ek= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA0_NTB4_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 62336 - LdsInitCVgprs: false - LdsNumBytes: 62336 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 12672 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 3 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 718 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA0_NTB4_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x32x64_MI16x1R5V35wpj69NpM5mzUXf0Zhcai3NKdhVE_Jz7gSQ7xFY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB2_NTC0_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 14592 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 719 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB2_NTC0_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x64x64_MI16x1SHcKpbeMDyXBrCXa7M1d0hY-5V3YQTXNV2k962zKu2Q= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB1_NTC2_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 27008 - LdsInitCVgprs: false - LdsNumBytes: 27008 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 18560 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 18560 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 1 - NonTemporalC: 2 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 720 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB1_NTC2_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x64_MI16x1omuc73oyBR7U3NzjgQXGNljyGZxlh7dEqHfHebLJ2X0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB2_NTC3_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 25216 - LdsInitCVgprs: false - LdsNumBytes: 25216 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 4608 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 20608 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 20608 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 2 - NonTemporalC: 3 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 721 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB2_NTC3_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x1dgun9e__Gzmtlud0Xf36rbsFBLUR_xE9XpjsbVjo4AY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA6_NTB3_NTC3_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29440 - LdsInitCVgprs: false - LdsNumBytes: 29440 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 3 - NonTemporalC: 3 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 722 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA6_NTB3_NTC3_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x1B_Lyvp5dWeKixwi2gHOu9zLZ6JRmHuHAmmRBFR30n8g= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB3_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29440 - LdsInitCVgprs: false - LdsNumBytes: 29440 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 2 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 723 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB3_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x64_MI16x1i3QEJiVmGb5gjfuyEykQWutr2YwUiiX3ahAHDecMx8E= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA3_NTB2_NTC7_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 50688 - LdsInitCVgprs: false - LdsNumBytes: 50688 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 7 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 724 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA3_NTB2_NTC7_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x64_MI16x1nUve6Doosp5bmngRKmeuvc6llO1F-P1ilbrDBwTYSdU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24832 - LdsInitCVgprs: false - LdsNumBytes: 24832 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 20608 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 20608 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 725 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x64_MI16x1BdCFJziQpw7DeYX3m0AwcEnWlr9uI3hLVhfPV4G8lLI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA1_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 54656 - LdsInitCVgprs: false - LdsNumBytes: 54656 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 12672 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 726 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA1_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x64_MI16xOMPy7NClxqyBEQUKpVREQHDGJFJDf9KHH_WDlph19Ls= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB2_NTC7_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59904 - LdsInitCVgprs: false - LdsNumBytes: 59904 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 7 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 727 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB2_NTC7_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x64_MI16x-gnAx6fuSUAg2DEBKpW-O5EpMF2gXA5frMYyk-me2l4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB2_NTC4_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59904 - LdsInitCVgprs: false - LdsNumBytes: 59904 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 728 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB2_NTC4_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16qkbgejEY4-sN3GnmO2JLIUuX_8eYRonn7mqNgAy1jrI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA2_NTB1_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 729 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA2_NTB1_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT160x128x64_MI16sbD4OYYxqA_VKyhrGIGV6QLEWjZ22THuaZamVyDZ1rk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_2_MO40_NTn1_NTA2_NTB4_NTC2_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 107008 - LdsInitCVgprs: false - LdsNumBytes: 107008 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 88576 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 88576 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [10, 2] - MIWaveTileA: 10 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 128 - MacroTileA: 160 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 4 - NonTemporalC: 2 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 5 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 730 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_2_MO40_NTn1_NTA2_NTB4_NTC2_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 2 - ThreadTileA: 40 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x32x256_MI16xX2LWdHMBiiJK0leRqPVs9QFN-C7DoG2d2I1SMPhzufs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC3_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58112 - LdsInitCVgprs: false - LdsNumBytes: 58112 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 1 - NonTemporalC: 3 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 731 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC3_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x256_MI16x-0x9A1X6-o7F-dO-YNjYWFeZ88aJFZOUzdk6u-NNMP4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 2 - NonTemporalC: 2 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 732 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x256_MI16xlKsy40ZEiJZd43bJ8yzOCJAsvlRp8LbLilH7qcMbK-s= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC2_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 - NonTemporalC: 2 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 733 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC2_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x256_MI16xpthvBqJYAMQu7bVeBH4gsv2lMS_aiy2Ljzd-hbl7AIc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 2 - NonTemporalC: 2 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 734 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x256_MI16xz342g_PxsMKOAbNGNIaWZR0PuLKRdlhWo1bnpC_WjuI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB1_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 735 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB1_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x48x128_MI16x4-M9nFXM8QfT1da5UDhI3BmFjNd00BGpMdvm3u5_lyk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA1_NTB0_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65024 - LdsInitCVgprs: false - LdsNumBytes: 65024 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 13824 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18432 - LdsOffsetMetadata_Blk: 51200 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 48 - MacroTileA: 64 - MacroTileB: 48 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 736 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA1_NTB0_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 3 - ThreadTileA: 4 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x128_MI16xwvgqld36rRgCy8XCu2RU_zvid-zmePF_xh9lAZ7loO8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA3_NTB3_NTC2_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 2 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 737 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA3_NTB3_NTC2_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x128_MI16xNjBWpMn3c1heP4-ZRijEunrZs2HE1y40j8U-EfP2U2k= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB2_NTC2_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 2 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 738 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB2_NTC2_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x128_MI16xvejCouANmUuYraciG-6uOt0f2njtbmy4tUtKdVZGg_0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 739 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x128_MI16xk7yXgpslShWlYmKP4vu12_Y2qkIHXCYRxkeBBpLKj68= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 110592 - LdsInitCVgprs: false - LdsNumBytes: 110592 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 740 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x64x128_MI16xQWdyuM1_8TsD3HiDTMLRw0nL3MEV30at8Lf6zEwYsvw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_1_MO40_NTn1_NTA0_NTB3_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 110080 - LdsInitCVgprs: false - LdsNumBytes: 110080 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 1] - MIWaveTileA: 6 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 6 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 741 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_1_MO40_NTn1_NTA0_NTB3_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 1 - ThreadTileA: 24 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x64x128_MI16xh_pjhf94hQ3YzECoL_Y99V1Y8Jm6JTWPVHKWAO2JxRk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA1_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 110080 - LdsInitCVgprs: false - LdsNumBytes: 110080 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 1] - MIWaveTileA: 6 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 6 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 742 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA1_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 1 - ThreadTileA: 24 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x128_MI16RdakLaLP4h1D6BJCriDlyUbGc0GeGn8QmI0SPC83Xwg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 743 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x128_MI16FE9cjsjuCQjPTd2OQ1h0Y7_M7XPKH0wXDeSkx1FruHI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC7_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 744 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC7_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x64_MI16x1NtnJtSv1Fr9l6BR5Qd8-oJfWiWRZ6lDC-4KEx-A0LrI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB2_NTC6_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49920 - LdsInitCVgprs: false - LdsNumBytes: 49920 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 745 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB2_NTC6_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x80x128_MI16ZO7FwDptmg3_UgTeoEluKean-4jY-HbF_httdSGTmiY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB2_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 - LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 80 - MacroTileA: 128 - MacroTileB: 80 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 746 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB2_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 5 - ThreadTileA: 8 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x192x128_MI16RSvcZUuQAJ7HS9i0hyvaUWbChM_7uAM_mTBAzePd0F0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB1_NTC4_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 144384 - LdsInitCVgprs: false - LdsNumBytes: 144384 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 55296 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 72192 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 89088 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 89088 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 192 - MacroTileA: 64 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 12 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 12 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 747 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB1_NTC4_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x80x128_MI16kwz7SePrqsr-88UBxVyY3Nncw5xR5BFYDbTaSvEJbhw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_5_MO40_NTn1_NTA0_NTB3_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 - LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 80 - MacroTileA: 128 - MacroTileB: 80 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 748 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_5_MO40_NTn1_NTA0_NTB3_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 5 - ThreadTileA: 8 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x128x128_MI16_b6b9vrrOsRF9FjsMoIWMKbPWc1tw4y9bNsA-zT8-xM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA2_NTB2_NTC1_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126464 - LdsInitCVgprs: false - LdsNumBytes: 126464 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 2] - MIWaveTileA: 6 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 749 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA2_NTB2_NTC1_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 2 - ThreadTileA: 24 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI16dbj71rapkPJ3wps8_sYFT11dMXVH_jpdCNybsnCvI4c= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA3_NTB1_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126464 - LdsInitCVgprs: false - LdsNumBytes: 126464 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 26112 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 6] - MIWaveTileA: 2 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 750 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA3_NTB1_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x128x64_MI16muNJiITd5z-woYWAMUnor_jijVfn7HMPmEZ0oF3tqa0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA3_NTB3_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 110592 - LdsInitCVgprs: false - LdsNumBytes: 110592 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 17408 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 4] - MIWaveTileA: 6 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 6 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 751 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA3_NTB3_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 4 - ThreadTileA: 24 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x224x64_MI16YukZKE8Da5cmVgOebjiRra8TOGmOnkexQwMOY8G7Tw8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA1_NTB3_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 - LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 35840 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 7] - MIWaveTileA: 6 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 224 - MacroTileA: 192 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 168 - NumGlobalWriteVectorsPerThread: 84 - NumLoadsA: 6 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 752 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA1_NTB3_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 7 - ThreadTileA: 24 - ThreadTileB: 7 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x128x64_MI16RBY6LZx8bRDK-Xx7Ag9NIH7PTJYLmTmmrxghZ8rhPlY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_2_MO40_NTn1_NTA1_NTB0_NTC5_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 44544 - LdsInitCVgprs: false - LdsNumBytes: 44544 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 44544 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 2] - MIWaveTileA: 12 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 5 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 6 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 753 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_2_MO40_NTn1_NTA1_NTB0_NTC5_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 2 - ThreadTileA: 48 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x64_MI16x1zuux-pVRl-1wfzxsFqtQU4GPxQDw3gPxbhsnul0VfaE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA5_NTB0_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 4 - LVCA: 8 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29440 - LdsInitCVgprs: false - LdsNumBytes: 29440 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 4608 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 754 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA5_NTB0_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x64_MI16x1wgOh7myiPvMSmjC0cxkJ85G_3aBg8ZvwOa_r8QT5oq0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA5_NTB3_NTC0_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 54656 - LdsInitCVgprs: false - LdsNumBytes: 54656 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 12672 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 755 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA5_NTB3_NTC0_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16xzgUl_qnlemPLgz6lX6E9lpTMaBR2BJtFQM7YKLibM7M= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA4_NTB1_NTC0_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 62848 - LdsInitCVgprs: false - LdsNumBytes: 62848 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 12672 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 16 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 756 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA4_NTB1_NTC0_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI32aRHb0rmz88BLMt7H-FvB4D1dm7-xNP0MDgAfTulTqCw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA6_NTB1_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99072 - LdsInitCVgprs: false - LdsNumBytes: 99072 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16640 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 757 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA6_NTB1_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 1 - ThreadTileA: 64 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16_YTPowexxKe3Jsx4qj9u4vtT3_Vi3jGEXqIrZCij8wU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA7_NTB0_NTC7_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 103552 - LdsInitCVgprs: false - LdsNumBytes: 103552 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 4 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 758 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA7_NTB0_NTC7_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x224x64_MI16GsV-fziBR_T47iTI6Sz59Ao6-CUbb1-0fp9ysnl62Hg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA6_NTB3_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 112000 - LdsInitCVgprs: false - LdsNumBytes: 112000 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 7] - MIWaveTileA: 4 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 224 - MacroTileA: 128 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 112 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 4 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 759 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA6_NTB3_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 7 - ThreadTileA: 16 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16xIlhxOH1Oqh9hMZmbXp3S_VoNh2dUxECCbYiRg30C--E= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA1_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65024 - LdsInitCVgprs: false - LdsNumBytes: 65024 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 13824 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18432 - LdsOffsetMetadata_Blk: 51200 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 6] - MIWaveTileA: 2 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 760 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA1_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16HNTrjnzevfXqAlxoeWd0GjjxJMyU3QHa3x7ak36qRKU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA6_NTB0_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99840 - LdsInitCVgprs: false - LdsNumBytes: 99840 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 761 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA6_NTB0_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16LxPJprZPJqL2HUBlQ1rYrzOdiZyfb2hiRAfmmzhc6-I= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA7_NTB0_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 103552 - LdsInitCVgprs: false - LdsNumBytes: 103552 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 4 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 762 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA7_NTB0_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT160x128x64_MI16gBsvtvKrEeTuCwRBOupt4cpfibONGYq2uPZ0SHbbfY8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_2_MO40_NTn1_NTA1_NTB1_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 105472 - LdsInitCVgprs: false - LdsNumBytes: 105472 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 88576 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 88576 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [10, 2] - MIWaveTileA: 10 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 128 - MacroTileA: 160 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 20 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 20 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 763 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_2_MO40_NTn1_NTA1_NTB1_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 2 - ThreadTileA: 40 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x384x64_MI16nmuzXpgsMN9AfDrnVukONHqfCmykHd7p7Kgqegj8TB4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x384x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA2_NTB2_NTC7_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 50688 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 84480 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 84480 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 384 - MacroTileA: 128 - MacroTileB: 384 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 7 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 12 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 12 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 764 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x384x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA2_NTB2_NTC7_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16_IqOxiGBDNP8ml49xazJLcAIIB2zcLhzL3v-NRUmSqk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB3_NTC5_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 765 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB3_NTC5_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x16x64_MI16xaHIUqumpYMOd_eSWVGafheovipUJxlMuIr2-iTHYjGI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_1_MO40_NTn1_NTA5_NTB0_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 4 - LVCA: 8 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51968 - LdsInitCVgprs: false - LdsNumBytes: 51968 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 2304 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [8, 1] - MIWaveTileA: 8 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 766 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_1_MO40_NTn1_NTA5_NTB0_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x64_MI16x1cP6IgbIVYUJrrrs2fHm3OtbSZ3P2bg2FSpaoSmHcKf8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24832 - LdsInitCVgprs: false - LdsNumBytes: 24832 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 20608 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 20608 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 64 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 767 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x48x64_MI16xmH7RnjKq3nvhzyoQ47EcGZgIa8QRqzxrV8MdEDpU-ZE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA7_NTB1_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58112 - LdsInitCVgprs: false - LdsNumBytes: 58112 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 6912 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18432 - LdsOffsetMetadata_Blk: 51200 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 48 - MacroTileA: 128 - MacroTileB: 48 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 16 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 768 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA7_NTB1_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT112x64x64_MI16xxLO83H1JHUDs3HCJ0kcE-UkraK1VFdP49cA-cwHsgTM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT112x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA2_NTB0_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58112 - LdsInitCVgprs: false - LdsNumBytes: 58112 - LdsNumElementsAlignedA: 16128 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16128 - LdsOffsetB_Blk: 48896 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16128 - LdsOffsetMetadata_Blk: 48896 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [7, 1] - MIWaveTileA: 7 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 112 - MacroTile1: 64 - MacroTileA: 112 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 28 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 14 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 14 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 769 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT112x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA2_NTB0_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 1 - ThreadTileA: 28 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16x5QMdH8YrjjeCSh_avPALQzY5SB_v1EPRRtI5SfrFqqQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA0_NTB1_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 62848 - LdsInitCVgprs: false - LdsNumBytes: 62848 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 12672 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 16 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 770 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA0_NTB1_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT448x16x64_MI16xmIZJTXVn3RUnJZbpj_qqvGQHG7gFV19O_PCjRx4HjtM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x16x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT7_1_MO40_NTn1_NTA6_NTB0_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 74240 - LdsInitCVgprs: false - LdsNumBytes: 74240 - LdsNumElementsAlignedA: 71680 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 71680 - LdsOffsetB_Blk: 202752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 74240 - LdsOffsetMetadata_Blk: 202752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [7, 1] - MIWaveTileA: 7 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 448 - MacroTile1: 16 - MacroTileA: 448 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 28 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 14 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 14 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 771 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x16x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT7_1_MO40_NTn1_NTA6_NTB0_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 1 - ThreadTileA: 28 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT416x32x64_MI16xb3XFV5kpSoh75iGZ49SShhlugugjIq5b23TsJ-Lk_dE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT416x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT13_1_MO40_NTn1_NTA5_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 54912 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 54912 - LdsOffsetB_Blk: 120448 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 54912 - LdsOffsetMetadata_Blk: 120448 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [13, 1] - MIWaveTileA: 13 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 416 - MacroTile1: 32 - MacroTileA: 416 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 52 - NumGlobalWriteVectorsPerThread: 52 - NumLoadsA: 13 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 13 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 772 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT416x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT13_1_MO40_NTn1_NTA5_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 52 - ThreadTile1: 1 - ThreadTileA: 52 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT224x64x64_MI16x9RLO9cqrWVBnf5WW56Yguk0AKiGz8o1xu1IbeQC3gHY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_1_MO40_NTn1_NTA5_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 42496 - LdsInitCVgprs: false - LdsNumBytes: 42496 - LdsNumElementsAlignedA: 32256 - LdsNumElementsAlignedB: 10240 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32256 - LdsOffsetB_Blk: 97792 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 42496 - LdsOffsetMetadata_Blk: 97792 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [14, 1] - MIWaveTileA: 14 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 224 - MacroTile1: 64 - MacroTileA: 224 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 2 - NonTemporalC: 7 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 56 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 7 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 7 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 773 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_1_MO40_NTn1_NTA5_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 56 - ThreadTile1: 1 - ThreadTileA: 56 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT224x96x64_MI16xyPY5B5hOQbVhJsxdxS4er--OQ-G5NLIqCO2zb6wLrrM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x96x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA6_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 - LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 35840 - LdsNumElementsAlignedB: 15360 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 35840 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [7, 3] - MIWaveTileA: 7 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 224 - MacroTile1: 96 - MacroTileA: 224 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 84 - NumGlobalWriteVectorsPerThread: 84 - NumLoadsA: 7 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 7 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 774 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x96x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA6_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 3 - ThreadTileA: 28 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xGRJ948PW2GY3Oh-BgL5E3cP96ILTWrBUyogE0huOZFk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 - LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 4 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 775 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM16_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x48x256_MI16x32d0yrzBuc6cQz2druL4nmWJZCsoXcp2TWx6BH5c44M= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB5_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 107776 - LdsInitCVgprs: false - LdsNumBytes: 107776 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 48 - MacroTileA: 32 - MacroTileB: 48 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 6 - NumGlobalWriteVectorsPerThread: 6 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 776 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB5_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 3 - ThreadTileA: 4 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x80x256_MI16xFfV_Lyi8V7Vj9rgZw1A__J5um7q_2JfLpTqYZudeVf8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x80x256_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA2_NTB5_NTC2_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 60928 - LdsInitCVgprs: false - LdsNumBytes: 60928 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 43520 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 60928 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 5] - MIWaveTileA: 1 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 80 - MacroTileA: 32 - MacroTileB: 80 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 5 - NonTemporalC: 2 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 10 - NumGlobalWriteVectorsPerThread: 10 - NumLoadsA: 4 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 777 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x80x256_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA2_NTB5_NTC2_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 5 - ThreadTileA: 4 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x160x128_MI16AbMFVKD5zbFEAS433D1hwrSvtHdmdgk3gF2Lnchkhdg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x160x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA3_NTB4_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 55296 - LdsInitCVgprs: false - LdsNumBytes: 55296 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 46080 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 74752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 55296 - LdsOffsetMetadata_Blk: 74752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 5] - MIWaveTileA: 1 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 160 - MacroTileA: 32 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 4 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 20 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 2 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 778 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x160x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA3_NTB4_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 5 - ThreadTileA: 4 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x512_MI16xGQWs24PV7Ym1YlzWNb85RVCaZ6eCHgA5WP9fLETVl0M= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 512 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB6_NTC3_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false - LSCA: 512 - LSCB: 512 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 4 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 3 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 779 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB6_NTC3_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x512_MI16xS5DY8moH66uzpr6SJxsOicRUOH_OQ0U6eqycBbQQpKk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 512 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB7_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false - LSCA: 512 - LSCB: 512 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 4 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 7 - NonTemporalC: 3 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 780 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB7_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x16x256_MI16xK2rvBSaK8FYS6JPr6NUxMFfDsXUoeePQtJqs_k2sb-k= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB6_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58112 - LdsInitCVgprs: false - LdsNumBytes: 58112 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 781 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB6_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x16x512_MI16xjr5IN7ix-0abROpjyU0ooEpE9LYZoM-aJ7Ri-cIzzGk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 512 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB4_NTC2_NTD7_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false - LSCA: 512 - LSCB: 512 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 115712 - LdsInitCVgprs: false - LdsNumBytes: 115712 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 98816 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 98816 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 4 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 4 - NonTemporalC: 2 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 782 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB4_NTC2_NTD7_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x16x512_MI16xsTI5BWTuHboBtEqod2TVB8lK-ydy8-gnlqHKaYHgf9Y= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 512 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB7_NTC3_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: false - LSCA: 512 - LSCB: 512 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 7 - NonTemporalC: 3 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 783 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB7_NTC3_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x256_MI16x2GFdk17QQ3uA1d6t9wU-zMO5ILQ12zaI9jC-3SbeW3w= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB6_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 784 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB6_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x160x64_MI16xIEJtM0Qs-Qt5ANRrJRe1y6NvM51H-1dK68AJYkLeue8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA2_NTB7_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63104 - LdsInitCVgprs: false - LdsNumBytes: 63104 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 160 - MacroTileA: 64 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 7 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 785 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA2_NTB7_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 5 - ThreadTileA: 8 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x256_MI16xeEawEtm9tN_EZpDn4_hzdfTUqHTyl86g1Ec6oFWG5Ko= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA3_NTB6_NTC0_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 0 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 786 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA3_NTB6_NTC0_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x96x128_MI16x3s3lr2-RoTRlIwDQjjn2ZSVERgN7-naW9cscRwbJKW4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA1_NTB2_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 55296 - LdsInitCVgprs: false - LdsNumBytes: 55296 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 55296 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 3] - MIWaveTileA: 3 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 96 - MacroTileA: 96 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 36 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 6 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 787 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA1_NTB2_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 3 - ThreadTileA: 12 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x32_MI16xrGuCXEyJQELfLKWrgj38v3ZHz02h1cWtKueGMrvcM6g= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB1_NTC6_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 - LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 7680 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 788 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB1_NTC6_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x96x64_MI16x7PXNBPx-awrBePFiQtA7_cPxzuUzObGAITEOG6nBd_I= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA3_NTB7_NTC7_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 112000 - LdsInitCVgprs: false - LdsNumBytes: 112000 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 12672 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 3] - MIWaveTileA: 8 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 96 - MacroTileA: 256 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 7 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 789 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA3_NTB7_NTC7_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 3 - ThreadTileA: 32 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16ZqWfdAnZESKd0mXkLxfL_F0MIMePupboVSfGTQic0Es= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA2_NTB7_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 107776 - LdsInitCVgprs: false - LdsNumBytes: 107776 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 3] - MIWaveTileA: 8 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 192 - MacroTileA: 128 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 790 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA2_NTB7_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 3 - ThreadTileA: 32 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16UZBV0Y--yvpSgMgqfad2-IgosEc_9T8mQsqktcRmzOE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB4_NTC7_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 4 - NonTemporalC: 7 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 791 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB4_NTC7_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x128_MI16xDKx3iHWUtLm2OzQG30OfE_ZHq_7AQXRRMtC4or3g8qc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24832 - LdsInitCVgprs: false - LdsNumBytes: 24832 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 20608 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 20608 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 4 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 792 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xVS6-8aSM6wy2k6RWi_rYvzRomSA3jlwT8gPLCp-eAwk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 - LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 793 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x16x64_MI16x1tn-CMhx1xYKpMgOqCcpBalQhLAky9RcYoTGCvj9LiAs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC5_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 27136 - LdsInitCVgprs: false - LdsNumBytes: 27136 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 2304 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 5 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 794 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC5_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x256_MI16xpaqLB2iptpeDBUIZq9RHY34zWEG6sfG_iD3vwCS6rvc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 100352 - LdsInitCVgprs: false - LdsNumBytes: 100352 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 3 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 795 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x64_MI16x1ST08a-lQgNJzT-bJf5BENGSqbDLJjbqK9ZONXtkOIO0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB1_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 54656 - LdsInitCVgprs: false - LdsNumBytes: 54656 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 12672 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 796 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB1_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT80x64x256_MI16xIqItKtjOeUz3Ra2yL7eq9_6MF0QGtPXxrqjl9UDVFBI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 156672 - LdsInitCVgprs: false - LdsNumBytes: 156672 - LdsNumElementsAlignedA: 43520 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 78336 - LdsOffsetB: 43520 - LdsOffsetB_Blk: 121856 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43520 - LdsOffsetMetadata_Blk: 121856 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [5, 1] - MIWaveTileA: 5 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 80 - MacroTile1: 64 - MacroTileA: 80 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 20 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 10 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 797 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 1 - ThreadTileA: 20 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT80x128x128_MI16a1GgaJ1XhpWBaiZAklfYPebdZd3fO_FZZ5JDkBjcRSc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 - LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 88576 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 88576 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [5, 2] - MIWaveTileA: 5 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 80 - MacroTile1: 128 - MacroTileA: 80 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 5 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 798 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 2 - ThreadTileA: 20 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x160x64_MI16Tj46MrSmcjsXsLpb8F4XMOc9k2TfvJyIN75AYpc9rNk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x160x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA2_NTB1_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 53248 - LdsInitCVgprs: false - LdsNumBytes: 53248 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 25600 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 53248 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 5] - MIWaveTileA: 6 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 160 - MacroTileA: 192 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 120 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 6 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 799 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x160x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA2_NTB1_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 5 - ThreadTileA: 24 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT288x96x64_MI16xcW6aDiVO9kNXRsYsIYuAvUGZD4Wy-UMkRlFyQ9a6KBc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT288x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT9_3_MO40_NTn1_NTA3_NTB4_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 - LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 46080 - LdsNumElementsAlignedB: 15360 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 46080 - LdsOffsetB_Blk: 111616 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 46080 - LdsOffsetMetadata_Blk: 111616 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [9, 3] - MIWaveTileA: 9 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 288 - MacroTile1: 96 - MacroTileA: 288 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 4 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 108 - NumGlobalWriteVectorsPerThread: 108 - NumLoadsA: 9 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 9 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 800 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT288x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT9_3_MO40_NTn1_NTA3_NTB4_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 36 - ThreadTile1: 3 - ThreadTileA: 36 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xR-0zy6tezCC3idb-nW6ozYgH4I_DR2dqYJ-S39GhMmA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB2_NTC2_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 - LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 4 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 2 - NonTemporalC: 2 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 801 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB2_NTC2_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xLQozifCqpnSazKcFwV_qVVGa5O46gmyk__cb9RHQQ8g= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB0_NTC3_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 - LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 802 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB0_NTC3_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16xKO0dNQa5bcn8HBJvyQ27-6Spm9wJOYkxisdXIhwkEfI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB1_NTC5_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 62336 - LdsInitCVgprs: false - LdsNumBytes: 62336 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 12672 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 803 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB1_NTC5_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x96x128_MI16xcQJaylFGXq_0TX9Hl4UpLe98bgmSD52B0suD_JFJXdc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA2_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120832 - LdsInitCVgprs: false - LdsNumBytes: 120832 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 3] - MIWaveTileA: 3 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 96 - MacroTileA: 96 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 36 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 6 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 804 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA2_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 3 - ThreadTileA: 12 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x64_MI16x1XSaY9E04TUGhE7jaKwxVi1iMJUyzWT0p2SC6I5Xq3dI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29440 - LdsInitCVgprs: false - LdsNumBytes: 29440 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 4608 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 805 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x64_MI16xlo5dPaFD1Z2wNLfWpqUVXrwi4qp1VyXkrK1mHg9vrBk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB2_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59904 - LdsInitCVgprs: false - LdsNumBytes: 59904 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 806 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB2_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16not8US0zBp6KQnvU6T7iGKsGOpHJyy6OXpjVcrh8di0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA3_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 125440 - LdsInitCVgprs: false - LdsNumBytes: 125440 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 4] - MIWaveTileA: 12 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 24 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 24 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 807 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA3_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 4 - ThreadTileA: 48 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x128_MI1h7E_d5sJJteGIIjL-Lj0Y2uruXc0_DPC4873LrLNlQc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA3_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 159744 - LdsInitCVgprs: false - LdsNumBytes: 159744 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 46080 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 79872 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 113664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 113664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 808 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA3_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x192x128_MI166Yhc8rndgFfNj72uWs6aJ8gw1o3E1jyb1Xl1FO_HXJc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x192x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA3_NTB1_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 162816 - LdsInitCVgprs: false - LdsNumBytes: 162816 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 55296 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 81408 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 107520 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 107520 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 3] - MIWaveTileA: 6 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 192 - MacroTileA: 96 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 72 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 6 - NumLoadsB: 12 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 12 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 809 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x192x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA3_NTB1_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 3 - ThreadTileA: 24 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x64_MI1653RT6YX0qL65pbCWKVRSCXEybm7S--Kgpdzo9IAq34I= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA3_NTB0_NTC7_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120832 - LdsInitCVgprs: false - LdsNumBytes: 120832 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 6] - MIWaveTileA: 6 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 192 - MacroTileA: 192 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 72 - NumLoadsA: 6 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 810 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA3_NTB0_NTC7_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 6 - ThreadTileA: 24 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16lyiZ_OWA7CBIxG5UQu-LoZ4PeF9fqQR2IL76Gm18yq4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA0_NTB0_NTC6_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 3] - MIWaveTileA: 16 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 811 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA0_NTB0_NTC6_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 3 - ThreadTileA: 64 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16FpHBpJztLFdPOpQm5LgXjXS6thXX1EofipNUGlAccrM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA2_NTB1_NTC7_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 3] - MIWaveTileA: 16 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 812 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA2_NTB1_NTC7_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 3 - ThreadTileA: 64 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x64_MI16x1peVBAmQGI6Gk8S474Te1ZTRXqknwudSRUC27VYvODes= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12480 - LdsInitCVgprs: false - LdsNumBytes: 12480 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 813 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x64_MI32x3or2ZfDxoEVPyIE9ENx34n2Ta6maweLlfi_muaxl1jt4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC3_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49792 - LdsInitCVgprs: false - LdsNumBytes: 49792 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 8320 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 3 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 814 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC3_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16F92daEiSH1niks6NSxdTY8_WZekz2j79_GMV5tkmidw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB3_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 103552 - LdsInitCVgprs: false - LdsNumBytes: 103552 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 4 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 815 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB3_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16O-lmJvNZ1yZAl5hwmLIls_3x7xZrDhodvBwMedgcR8s= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB3_NTC7_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 816 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB3_NTC7_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x128_MI16f28emv7KjCxL7hEsAfYMGEjDqXKAlsfhf-zSWgM_SE4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA4_NTB2_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 2 - NonTemporalC: 3 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 817 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA4_NTB2_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI3WK9qxSffVXxVRK0eEFODtLFet-BzA4MlNP4jvdkjtYU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA5_NTB0_NTC6_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 136192 - LdsInitCVgprs: false - LdsNumBytes: 136192 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 68096 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 818 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA5_NTB0_NTC6_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 1 - ThreadTileA: 64 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x128x128_MI16mttDLu8BtirRNUBQVYgcmsN4b0XMQewoI9Uyje1UTQM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA4_NTB2_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126464 - LdsInitCVgprs: false - LdsNumBytes: 126464 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 2] - MIWaveTileA: 6 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 819 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA4_NTB2_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 2 - ThreadTileA: 24 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI3X3GmYl-7ra3jIh9fVES1YTKpNJhlxYauDVyCOafbM_k= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 820 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x128_MI1MSg7HvaEv6UulCPk3flAi9a5H9WOAXEFX3ej6sG_23Q= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_10_MO40_NTn1_NTA1_NTB2_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 156672 - LdsInitCVgprs: false - LdsNumBytes: 156672 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 43520 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 78336 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 113152 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 113152 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 10] - MIWaveTileA: 2 - MIWaveTileB: 10 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 8 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 821 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_10_MO40_NTn1_NTA1_NTB2_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 10 - ThreadTileA: 8 - ThreadTileB: 10 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16G9rWlHGj8AkE0shTbmuyb8kBmQeD0vtA-3rtxqskVSo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA5_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 110592 - LdsInitCVgprs: false - LdsNumBytes: 110592 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 6] - MIWaveTileA: 4 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 192 - MacroTileA: 128 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 822 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA5_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 6 - ThreadTileA: 16 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x224x64_MI165-7blOixwN1FuEz7zrKBwR46Fnw4Xw_XU9I4GbQUS_M= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA7_NTB2_NTC6_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 118784 - LdsInitCVgprs: false - LdsNumBytes: 118784 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 35840 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 7] - MIWaveTileA: 4 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 224 - MacroTileA: 128 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 112 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 4 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 823 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA7_NTB2_NTC6_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 7 - ThreadTileA: 16 - ThreadTileB: 7 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16c39Ae3LGGzg5uhOwIVro1DsGeSOQFAOSrUr5RBywozg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB3_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 4 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 824 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB3_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16wj34IQ8H1F7QK2OFVXhCekHWMDqTfcCtKjlW7CWIPNo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB0_NTC2_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 2 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 825 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB0_NTC2_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI32qUwpgF5DFZUbxYcCe9WC2bRLLCw40-B2Qejmz10YiQo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB1_NTC5_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 - LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 826 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB1_NTC5_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16JqQbv0miPuBjuGTO1QiYzu8EIEL_UJAOO0xeIxhcEbs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA3_NTB3_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 827 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA3_NTB3_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16nI3ni6kxE02eaceTyJPHQfs7yo8MOR6rn7fXPUu2ngU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA3_NTB1_NTC3_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126464 - LdsInitCVgprs: false - LdsNumBytes: 126464 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 4] - MIWaveTileA: 12 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 3 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 828 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA3_NTB1_NTC3_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 4 - ThreadTileA: 48 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16831DzVdHpcAfN06nMnpuZxaklAEKQXJeyS_IeVQlloc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA1_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 - LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 24 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 24 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 829 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA1_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI1PcicnN4rhPttnOPnqCBtYuhygrtG2QIv9rps9xbG1-w= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA7_NTB2_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 2 - NonTemporalC: 2 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 830 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA7_NTB2_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16zFtrwhsejERtXD8XH0g2fVim3OqNCT3BFOFeYtWolT4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA5_NTB3_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 110592 - LdsInitCVgprs: false - LdsNumBytes: 110592 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 6] - MIWaveTileA: 4 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 192 - MacroTileA: 128 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 3 - NonTemporalC: 4 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 831 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA5_NTB3_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 6 - ThreadTileA: 16 - ThreadTileB: 6 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI163EUBhVCu7YU8YiB7B3wxOFkCvzvOOK-aaTXEsimCDG0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB3_NTC7_NTD6_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 47616 - LdsInitCVgprs: false - LdsNumBytes: 47616 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 30720 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 47616 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 3] - MIWaveTileA: 8 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 192 - MacroTileA: 128 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 832 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB3_NTC7_NTD6_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 3 - ThreadTileA: 32 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI16a-E-7tb-ZO9KJftGvW0wAT7_6vQqw3M4FpLKjRsuatA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB3_NTC5_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 - LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 833 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB3_NTC5_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT160x128x64_MI16IFqTcGgWxKaegNxP_uRujs8TxMJusMTslwWMuxlGTNk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT10_2_MO40_NTn1_NTA0_NTB0_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 41472 - LdsInitCVgprs: false - LdsNumBytes: 41472 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 88576 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 41472 - LdsOffsetMetadata_Blk: 88576 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [10, 2] - MIWaveTileA: 10 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 128 - MacroTileA: 160 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 5 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 834 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT10_2_MO40_NTn1_NTA0_NTB0_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 2 - ThreadTileA: 40 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16Lf7yMV7SfSJQN_Rq9dH2jSQ03y3QAAOcvNPbPiBs8gU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB1_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 835 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB1_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI164h-9S6LBBQbawdsG_igSxm0hlDyJtLl8drjlKCn9Sbs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 836 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16rRMDIrqhG8IEfbyOeh_QKMWXhQ7OssLMSua0QphqBuA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA2_NTB3_NTC7_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 107776 - LdsInitCVgprs: false - LdsNumBytes: 107776 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 3] - MIWaveTileA: 8 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 192 - MacroTileA: 128 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 837 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA2_NTB3_NTC7_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 3 - ThreadTileA: 32 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT8x512x32_MI4x4xwGexg6NUAYSiNzq6JH2hfgr3rNJK4soBJJB-MXGLwqI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT8x512x32_MI4x4x16_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA3_NTB1_NTC2_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG4_64_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 37504 - LdsInitCVgprs: false - LdsNumBytes: 37504 - LdsNumElementsAlignedA: 640 - LdsNumElementsAlignedB: 36864 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 640 - LdsOffsetB_Blk: 66176 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 37504 - LdsOffsetMetadata_Blk: 66176 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 32 - MFMA_BF16_1K: true - MIArchVgpr: 1 - MIBlock: [4, 4, 4, 16, 1, 16] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 8 - MacroTile1: 512 - MacroTileA: 8 - MacroTileB: 512 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 16 - MatrixInstBM: 1 - MatrixInstBN: 16 - MatrixInstK: 4 - MatrixInstM: 4 - MatrixInstN: 4 - MatrixInstruction: [4, 4, 4, 16] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 2 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 838 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT8x512x32_MI4x4x16_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA3_NTB1_NTC2_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG4_64_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 1 - SubGroup1: 256 - SubGroupA: 1 - SubGroupB: 256 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [4, 64, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x128_MI16x0shvE3W20sfkGsqjkGAyg6djwqaSgPHwUixXD3lRH6I= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA3_NTB1_NTC6_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 4 - LVCA: 16 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58880 - LdsInitCVgprs: false - LdsNumBytes: 58880 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 839 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA3_NTB1_NTC6_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x144x64_MI16KX7iYmMaGYl8nu7g4CVDr0rGfHiYuGHGAZaIlxOe0vg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x144x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA2_NTB5_NTC3_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120064 - LdsInitCVgprs: false - LdsNumBytes: 120064 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 20736 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 9] - MIWaveTileA: 4 - MIWaveTileB: 9 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 144 - MacroTileA: 256 - MacroTileB: 144 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 5 - NonTemporalC: 3 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 8 - NumLoadsB: 18 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 18 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 840 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x144x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA2_NTB5_NTC3_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 9 - ThreadTileA: 16 - ThreadTileB: 9 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16TyXNsJWi7nS-fhS0PXr9_wo25310Nj8ClA6t3ipliVQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB5_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 4 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 841 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB5_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16Jygg6B3xFpjouJs-J7lNnFqbr_fjHkpVkG7TeVAeGxw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB7_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 3] - MIWaveTileA: 16 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 842 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB7_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 3 - ThreadTileA: 64 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI1669pYsjaG-bhznKUtGHu2lf-bBXpxPXs6Ova8-wIDIt0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB6_NTC6_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 6 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 843 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB6_NTC6_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 7 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x128_MI16xs7-6yhjj1HNkkgTrM--grrsZfgn_E6vCNGAM1q6_bH0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB1_NTC3_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 4 - LVCA: 16 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58880 - LdsInitCVgprs: false - LdsNumBytes: 58880 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 3 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 844 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB1_NTC3_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x64x32_MI16x1il9kR7gtAdgfECIrzvR6BT10ESmOvAbBH1EZq9McV9o= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC1_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 13568 - LdsInitCVgprs: false - LdsNumBytes: 13568 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1152 - LdsOffsetMetadata_Blk: 9344 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 845 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC1_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT384x160x64_MI16R6ufUzTOAsbT90lr9NkBtbosxb-cN2_TH1hLdM-Dexk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA0_NTB1_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 146688 - LdsInitCVgprs: false - LdsNumBytes: 146688 - LdsNumElementsAlignedA: 52224 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 73344 - LdsOffsetB: 52224 - LdsOffsetB_Blk: 125568 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 52224 - LdsOffsetMetadata_Blk: 125568 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [12, 5] - MIWaveTileA: 12 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 384 - MacroTile1: 160 - MacroTileA: 384 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 48 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 48 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 846 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA0_NTB1_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 5 - ThreadTileA: 48 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16wwr4HSZThMg9UFZzWEWSfwJ94L4DnKO_w2bbGpU0T2I= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA4_NTB2_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 847 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA4_NTB2_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x32_MI16x1CkA53Ue8YbA1i51n1-TuEmeUsso4seYgPmxkN3dKByo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: false - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB0_NTC5_NTD7_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 15360 - LdsInitCVgprs: false - LdsNumBytes: 15360 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4608 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 5 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 848 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB0_NTC5_NTD7_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x256_MI16xFPsdLYz1dTSwqg2G0kCB_cwzxZZmZnKIf6S528fVuxQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB7_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 7 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 849 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB7_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x32_MI16x1pxdxB7UzAAXNDQuUvHLJ9h-cxAxC2V8kba8-ffvW4ms= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC5_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 10240 - LdsInitCVgprs: false - LdsNumBytes: 10240 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 5 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 850 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC5_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x128_MI16x7GJuR3bjr01Bn5Hf-sWoQ6LJVu-AfpXaH9dH5ASE18s= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB4_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 - LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 17408 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 4 - NonTemporalC: 1 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 851 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB4_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x32_MI16x1DjaUH_vsJx9v9dbE0O3rvNbppHjy8gK0goRMSQdisjE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA2_NTB0_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26112 - LdsInitCVgprs: false - LdsNumBytes: 26112 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 5 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 852 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA2_NTB0_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x32_MI16xrNceqmal1vdzvtYc91d52ZnBJgSpU1Rdp4kEJiBIQM8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA1_NTB2_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29056 - LdsInitCVgprs: false - LdsNumBytes: 29056 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 1] - MIWaveTileA: 8 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 853 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA1_NTB2_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16xk52S4ad-TevDChGWNhPb1yoNNAIq5pQ-fdl5f8rEbQM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB1_NTC6_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58880 - LdsInitCVgprs: false - LdsNumBytes: 58880 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 854 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB1_NTC6_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x32_MI16x_hS5WClhqnmWkhPlPPC32KhHn-1rAZBvooAf1TMGx8w= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB0_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 14336 - LdsInitCVgprs: false - LdsNumBytes: 14336 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 25600 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 14336 - LdsOffsetMetadata_Blk: 25600 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 855 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB0_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x128_MI16iVIYZb1BYuBA6lEYwxwvZ7109NzN99TD6TIDLsTQRVQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB1_NTC6_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 856 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB1_NTC6_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16x71aO-s1g_2o51T8HyLV9LYsofU877J6GJdCwBJQpGkQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_1_MO40_NTn1_NTA1_NTB2_NTC6_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58112 - LdsInitCVgprs: false - LdsNumBytes: 58112 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 1] - MIWaveTileA: 8 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 857 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_1_MO40_NTn1_NTA1_NTB2_NTC6_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x128_MI16xS2jMbDtQ5_F3P86CkrJHRZTDOo-7OylhPwPhVJb2OL8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA1_NTB1_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 110592 - LdsInitCVgprs: false - LdsNumBytes: 110592 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 858 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA1_NTB1_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16BFaR0ZYbZkhwWjT_Ba8i0NCAy6KAmcxHOp6SgAHg2_o= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA2_NTB1_NTC5_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 107776 - LdsInitCVgprs: false - LdsNumBytes: 107776 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 3] - MIWaveTileA: 8 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 192 - MacroTileA: 128 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 16 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 859 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA2_NTB1_NTC5_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 3 - ThreadTileA: 32 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16KPP61EtqBNTlRJ-I_aAukkPlJtkB7oO8oAgiQ2R0llw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB6_NTC7_NTD6_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 6 - NonTemporalC: 7 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 860 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB6_NTC7_NTD6_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x224x64_MI16Fw_dvYNukHFKOVviLuxEVKNl1KXvcgtWMXYpu_RV9bg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB3_NTC7_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 53248 - LdsInitCVgprs: false - LdsNumBytes: 53248 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 35840 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 53248 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 7] - MIWaveTileA: 4 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 224 - MacroTileA: 128 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 112 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 4 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 861 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB3_NTC7_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 7 - ThreadTileA: 16 - ThreadTileB: 7 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x64_MI16x1fVSXE7Hp2-FCWO4DdLL1bSkdGk5CHnY7g80tz5B_EqA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB2_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12480 - LdsInitCVgprs: false - LdsNumBytes: 12480 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 862 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB2_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x128_MI16xBe8y_mZMAbiXjK4oCAn0nsuNlixq212gvIUfboQHg4k= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC2_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 4 - LSPB: 4 - LVCA: 16 - LVCB: 16 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24832 - LdsInitCVgprs: false - LdsNumBytes: 24832 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 20608 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 20608 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 2 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 64 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 863 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC2_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x64_MI16x1pQSIDqCoQx3_Ri79K5ad2lS5jlijd7Sp5JLVMDsb73Q= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24832 - LdsInitCVgprs: false - LdsNumBytes: 24832 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 20608 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 20608 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 864 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x128_MI16xQUi52S7VSdGJDn5Bpm7RKSM5k6RLZWm__jwH2YQ0HgQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 4 - LSPB: 4 - LVCA: 16 - LVCB: 16 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24832 - LdsInitCVgprs: false - LdsNumBytes: 24832 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 20608 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 20608 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 64 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 865 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x320x32_MI16PMVTS3yNba5IV3cNNVe8WrfKpppfUC2VxgXy7Itzw2Q= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x320x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA2_NTB3_NTC5_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63104 - LdsInitCVgprs: false - LdsNumBytes: 63104 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 10] - MIWaveTileA: 4 - MIWaveTileB: 10 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 320 - MacroTileA: 128 - MacroTileB: 320 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 866 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x320x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA2_NTB3_NTC5_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS64_SPO0_SRVW0_SSO1_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 64 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 10 - ThreadTileA: 16 - ThreadTileB: 10 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x64_MI16x1z-qFNfK0ass6s-mBeCbn2T-UhuXzV92gtCkJ2GTXgUE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12480 - LdsInitCVgprs: false - LdsNumBytes: 12480 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 867 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x256_MI16xYSECTInVbJDnbiylgv7Bz8iWJfwZ9tNFBp-qDs3Q8Zg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB6_NTC3_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51712 - LdsInitCVgprs: false - LdsNumBytes: 51712 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51712 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 3 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 868 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB6_NTC3_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x224x64_MI16mkECGi4PfVq_DSg4bBQNpVTBWlPse1LKEVgkFINTS_M= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB6_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 - LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 35840 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 7] - MIWaveTileA: 6 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 224 - MacroTileA: 192 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 168 - NumGlobalWriteVectorsPerThread: 84 - NumLoadsA: 6 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 869 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB6_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 7 - ThreadTileA: 24 - ThreadTileB: 7 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x288x64_MI16sEKHjZV32z0wCgP-Ck4vxRRsykPsLTvAc2SuYHI8fYk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x288x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA3_NTB0_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 73728 - LdsInitCVgprs: false - LdsNumBytes: 73728 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 46080 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 158720 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 73728 - LdsOffsetMetadata_Blk: 158720 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 9] - MIWaveTileA: 6 - MIWaveTileB: 9 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 288 - MacroTileA: 192 - MacroTileB: 288 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 216 - NumGlobalWriteVectorsPerThread: 108 - NumLoadsA: 6 - NumLoadsB: 9 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 9 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 870 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x288x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA3_NTB0_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 9 - ThreadTileA: 24 - ThreadTileB: 9 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x320x64_MI16YJhfH7jHUovMMi6uEf7-a6s9RhmceVmNYR2wLw7yChQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA1_NTB3_NTC7_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 77312 - LdsInitCVgprs: false - LdsNumBytes: 77312 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 51200 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 157184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 77312 - LdsOffsetMetadata_Blk: 157184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 5] - MIWaveTileA: 12 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 320 - MacroTileA: 192 - MacroTileB: 320 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 6 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 871 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA1_NTB3_NTC7_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 5 - ThreadTileA: 48 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI161NTL0auO2BZyLodIslMIR1DwO0ELH4sdtwc7edrEZa4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB7_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 60928 - LdsInitCVgprs: false - LdsNumBytes: 60928 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 60928 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 4] - MIWaveTileA: 12 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 872 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB7_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 4 - ThreadTileA: 48 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x320x64_MI16RnRjOTmkiumEFZRZUcGRxBp2kZmZrxMRoo6KyHD2Tbw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA2_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA2_NTB5_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 77312 - LdsInitCVgprs: false - LdsNumBytes: 77312 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 51200 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 157184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 77312 - LdsOffsetMetadata_Blk: 157184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 5] - MIWaveTileA: 12 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 320 - MacroTileA: 192 - MacroTileB: 320 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 5 - NonTemporalC: 4 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 24 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 24 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 873 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA2_NTB5_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 5 - ThreadTileA: 48 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x320x64_MI16nj2MHcRCzzxakd1nPd929ZQwVRlEwGsPVIXZzv0nzwo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA2_NTB3_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 25344 - LdsNumElementsAlignedB: 42240 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 25344 - LdsOffsetB_Blk: 92928 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25344 - LdsOffsetMetadata_Blk: 92928 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 5] - MIWaveTileA: 12 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 320 - MacroTileA: 192 - MacroTileB: 320 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 4 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 6 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 874 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA2_NTB3_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 5 - ThreadTileA: 48 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16x0I8NwzTiKxiFywgzFRbmksUFYxRYGXCVu8nZ_ZarTes= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC1_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 2 - LSPB: 2 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 - LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 64 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 875 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC1_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16LdT3Ok_31-UGAeeLmLhm7J_PfRw3U1qnSbzalcKhsDE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB6_NTC5_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67584 - LdsInitCVgprs: false - LdsNumBytes: 67584 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67584 - LdsOffsetMetadata_Blk: 164864 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 6 - NonTemporalC: 5 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 876 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB6_NTC5_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16WKRGsmaOqbeAJEIx4DocFL4x5TrEqHvywiHDMcQwGfQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB5_NTC3_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67584 - LdsInitCVgprs: false - LdsNumBytes: 67584 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67584 - LdsOffsetMetadata_Blk: 164864 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 3 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 877 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB5_NTC3_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16XDkV1WWdgWCp28uTkfX-ZMj-6qQkpDx3PqWeyjXkBTU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA2_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB6_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67584 - LdsInitCVgprs: false - LdsNumBytes: 67584 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67584 - LdsOffsetMetadata_Blk: 164864 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 6 - NonTemporalC: 6 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 32 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 878 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB6_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16LONw0lPkipYqJjxTghhorV3ImHRYpxKO0mmzF67V2QE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB6_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67584 - LdsInitCVgprs: false - LdsNumBytes: 67584 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67584 - LdsOffsetMetadata_Blk: 164864 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 6 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 879 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB6_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI161P-AfpKbDA0dd-RJGocfqkcL14EjTPhFXXFVBZp9C2w= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 7 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 880 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16UL-0jXrB-iJUBJ6z2nuNkq7qNwG4Umv45OTrMUHD_Sg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA3_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 16] - MIWaveTileA: 4 - MIWaveTileB: 16 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 881 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA3_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 16 - ThreadTileA: 16 - ThreadTileB: 16 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI32mMYwjHl1gVR-fkug0zGR3LxCuZZ-9gAO-iqfZWSDU6A= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67584 - LdsInitCVgprs: false - LdsNumBytes: 67584 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67584 - LdsOffsetMetadata_Blk: 164864 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 882 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x128_MI16xZxuiwjUCkloWJns9y2HurS-nLYzTfpPUANwjQ59sWfs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 100352 - LdsInitCVgprs: false - LdsNumBytes: 100352 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 883 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x256_MI16xBXEA7QLtW3O9ZsvGfaJqMm5Y9FOpoF3EFoggHLq7JtM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 884 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x32_MI16YyEvEgPf2u14k5vjILjzibCKSBhwfrPuXLaNFW_Wrns= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB3_NTC4_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 64 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 100864 - LdsInitCVgprs: false - LdsNumBytes: 100864 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17920 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 4 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 885 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB3_NTC4_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI163pH1vLow3EsN2uQ-opRuYXi1hCiREgsNFKkSuy5vILk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 886 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x128_MI16mPtFI8cyGaBSop0zx9aHg6nxxsRXMm0hN1_8JohpoXk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB1_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117248 - LdsInitCVgprs: false - LdsNumBytes: 117248 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 887 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB1_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16H5rgVvmhoTBq8gkcv1oEUSoSiFuvEx01LilKaZ8NaB4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 2 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 888 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16DiVvEf0br81UZSidSMu5XE0YOZ0sVEbKyte4Rsents8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB3_NTC5_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 889 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB3_NTC5_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI328z7zlrPZoNFWjRgoyWQ6B8tO670SBJPW6w9Uj7PbVAE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA1_NTB3_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 136192 - LdsInitCVgprs: false - LdsNumBytes: 136192 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 68096 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 2] - MIWaveTileA: 8 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 890 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA1_NTB3_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 128 - ThreadTile1: 2 - ThreadTileA: 128 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI167_rpY4LIMTOrIm1LuMEtZDMEAg2DwRP8EJVrJOoqzGE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC7_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 891 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC7_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16y2-BM5sQUywvo0UBxMHcKo0NNjayeizkxV9zqVnpXFc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 4 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 892 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x128_MI16xms5UVBAC-5pKvf1spjSRSVxkYaWKIBiWqZuQ2LImnzc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB6_NTC1_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 4 - LSPB: 4 - LVCA: 16 - LVCB: 16 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 9216 - LdsInitCVgprs: false - LdsNumBytes: 9216 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 4608 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 64 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 893 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB6_NTC1_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x128_MI16xiQoTBpOt_KCP21uwLsp7NPSJR81sV7RqxGZk5C2qlX4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB7_NTC2_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 4 - LSPB: 4 - LVCA: 16 - LVCB: 16 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 9216 - LdsInitCVgprs: false - LdsNumBytes: 9216 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 4608 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 7 - NonTemporalC: 2 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 64 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 894 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB7_NTC2_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16ppuM5ImiGAl39QcT8TshVstt5olMsWZONgDjC47gkgQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 895 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16TRkaxmxgVEaCXigbsd4lSs-2bWmBfCiCaTIkhvyXtjQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA6_NTB3_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 - LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 896 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA6_NTB3_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16DLK_b4BqmIOviXWtqdwvDutysymqgIRyfJqzydL2m1M= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA5_NTB0_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 - LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 897 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA5_NTB0_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16wNIN879MSkh2qRAl8HYpv_Rpz5N83A-i1GYbj0niHMM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA5_NTB3_NTC3_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 - LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 3 - NonTemporalC: 3 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 24 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 24 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 898 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA5_NTB3_NTC3_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x32_MI321okaBOV1fu9vWQNAk5Oq47xiUzXFyw7FwdjXTK43yS8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32256 - LdsInitCVgprs: false - LdsNumBytes: 32256 - LdsNumElementsAlignedA: 13824 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 13824 - LdsOffsetB_Blk: 46592 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32256 - LdsOffsetMetadata_Blk: 46592 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 2] - MIWaveTileA: 6 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 899 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 96 - ThreadTile1: 2 - ThreadTileA: 96 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 3 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI168VDRIrx23PTuC6lhze-A-0APG55SFM3_sqQjioo3is8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB2_NTC4_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26624 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26624 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 900 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB2_NTC4_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16x_6WVJaqJvDAy3oUIuceulyLwFp9Nt9m6fESDfdDIEk0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA2_NTB3_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58112 - LdsInitCVgprs: false - LdsNumBytes: 58112 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 1] - MIWaveTileA: 8 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 901 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA2_NTB3_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16xiserYeQLI7kbxfk2Liw0IzS1NzNpjEwTQqu2GknzFiU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 27136 - LdsInitCVgprs: false - LdsNumBytes: 27136 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 10240 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27136 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 1] - MIWaveTileA: 8 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 902 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16f5aXipoZHEFnpnwprApAAp-vVGXpbr5VjryeZd_ZZYw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 903 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT512x128x32_MI32sDHpLp_qRcXmLqvyJAA4COKfyZCAdYezAEba9LM73SU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT512x128x32_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA2_NTB0_NTC5_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 64 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 - LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 2] - MIWaveTileA: 8 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 512 - MacroTile1: 128 - MacroTileA: 512 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 5 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 904 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT512x128x32_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA2_NTB0_NTC5_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 128 - ThreadTile1: 2 - ThreadTileA: 128 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x16x512_MI16xHav13Ay7faFoeK4BhJe8UYA01ZshuvwPXlxS_wY_B-Y= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 512 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x512_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB6_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 512 - LSCB: 512 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 84480 - LdsInitCVgprs: false - LdsNumBytes: 84480 - LdsNumElementsAlignedA: 67584 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 67584 - LdsOffsetB_Blk: 198656 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 84480 - LdsOffsetMetadata_Blk: 198656 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 512 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 6 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 905 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x512_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB6_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16q8NEDpD13SSNUu5bllAIXsJO-kShQHfpJNFTTz1XaoQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC6_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 906 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC6_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xWRRxsqJ7jMcN3RhwdtlcrN_appCXI3uzm_0C6uSkDMk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB2_NTC3_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 - LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 2 - NonTemporalC: 3 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 907 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB2_NTC3_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x512_MI16xUxsmMWfC7lMbayJUFJtt3uhGmPoX5bqgzvqCGGL3ytI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 512 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC2_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false - LSCA: 512 - LSCB: 512 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 4 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 - NonTemporalC: 2 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 908 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC2_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x32x512_MI16xRRDqwALEE4LHHM1M4ceEUH0APTRcUTuF9PVaZJmnhT0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 512 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 - LDSTrInst: false - LSCA: 512 - LSCB: 512 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 909 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x32x512_MI16xW4SabINezihuadnkWBODMFyHK6ApsCWB1xnbnvs0qI4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 512 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false - LSCA: 512 - LSCB: 512 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 2048 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 115712 - LdsInitCVgprs: false - LdsNumBytes: 115712 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 4 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 910 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x32x128_MI16xizr7Q8_nCrHLOtGHAAVLDiIPvJtsXUwahODl32ZXFqU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x32x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_1_MO40_NTn1_NTA4_NTB2_NTC5_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 25344 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 25344 - LdsOffsetB_Blk: 90880 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25344 - LdsOffsetMetadata_Blk: 90880 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [6, 1] - MIWaveTileA: 6 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 32 - MacroTileA: 96 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 12 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 12 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 911 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x32x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_1_MO40_NTn1_NTA4_NTB2_NTC5_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 1 - ThreadTileA: 24 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x16x256_MI16x6CocHSVfjUtMgJadpMnsKR32nEDOIAmuC97s9-SGqFk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB2_NTC3_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 109056 - LdsInitCVgprs: false - LdsNumBytes: 109056 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 8704 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 2 - NonTemporalC: 3 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 912 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB2_NTC3_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x32x64_MI16xAL4LlplqX-lASFEbhgTbShC-GDPkvccQf2jTTz-nVfg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x32x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA3_NTB3_NTC5_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 38912 - LdsInitCVgprs: false - LdsNumBytes: 38912 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 38912 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 1] - MIWaveTileA: 8 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 32 - MacroTileA: 256 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 913 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x32x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA3_NTB3_NTC5_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x128x64_MI16yj-r8sQkpPX2ARVU-HAhTbcq98ZTuNBX4v6RVEpYxmA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC1_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 110080 - LdsInitCVgprs: false - LdsNumBytes: 110080 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 4] - MIWaveTileA: 6 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 24 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 24 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 914 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC1_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 4 - ThreadTileA: 24 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16UY7JybfNYsXbB6ePwJELigTUPhCwSjLgdXAzMyu2h2o= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA3_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 - LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 915 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA3_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT160x192x64_MI16YPgxiAM3E1SkUV79IztVOV85qe_ULaWDpoyyCH7fevw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_3_MO40_NTn1_NTA0_NTB1_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 113920 - LdsInitCVgprs: false - LdsNumBytes: 113920 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 88576 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 88576 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [10, 3] - MIWaveTileA: 10 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 192 - MacroTileA: 160 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 120 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 20 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 20 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 916 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_3_MO40_NTn1_NTA0_NTB1_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 3 - ThreadTileA: 40 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16Yb6vs2wMAek_0gQXVuxZgWD4oIrhpoUVVgQWjM-7NSU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA2_NTB1_NTC0_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 134144 - LdsInitCVgprs: false - LdsNumBytes: 134144 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 32256 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67072 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 101888 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 101888 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 14] - MIWaveTileA: 4 - MIWaveTileB: 14 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 56 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 917 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA2_NTB1_NTC0_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 14 - ThreadTileA: 16 - ThreadTileB: 14 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x128_MI16NvJYM5TYlxX33xhRC3rYojBFPcYDUmQTKy-xgGL2zfc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA5_NTB1_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 918 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA5_NTB1_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x256x64_MI32BUhgY5-OmEvd0k2y390LwC1WKTzoJmnEmMybvkNcvGg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA4_NTB3_NTC6_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 - LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 919 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA4_NTB3_NTC6_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT80x128x128_MI16CH6-2Ow9x-ZJRcCwf3Spct6EpgpY9UjXCKmQfd8TKYI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 - LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 88576 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 88576 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [5, 2] - MIWaveTileA: 5 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 80 - MacroTile1: 128 - MacroTileA: 80 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 5 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 920 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 2 - ThreadTileA: 20 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT48x32x256_MI16xNp06_mGTrXDLa_B2FY9_xiHpNe0EWyYQPcSwC-WZ3Qo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA6_NTB3_NTC1_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 107776 - LdsInitCVgprs: false - LdsNumBytes: 107776 - LdsNumElementsAlignedA: 25344 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 25344 - LdsOffsetB_Blk: 90880 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25344 - LdsOffsetMetadata_Blk: 90880 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [3, 1] - MIWaveTileA: 3 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 32 - MacroTileA: 48 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 12 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 12 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 921 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA6_NTB3_NTC1_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 1 - ThreadTileA: 12 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16lvReLjX23li0zKLF0BjlXHUVB4dqsAX9tBGKFbFUMFU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA3_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 125440 - LdsInitCVgprs: false - LdsNumBytes: 125440 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 4] - MIWaveTileA: 12 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 24 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 24 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 922 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA3_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 4 - ThreadTileA: 48 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16Q0iQTv2siYkhhsH27R4bwiDBwuYbcFbclooq76DRSmQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB1_NTC1_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 923 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB1_NTC1_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xVOcz9sQMnRhNkFWae_nQBMHW5856NLlj0IGASkeA7mU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB1_NTC3_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 - LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 1 - NonTemporalC: 3 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 924 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB1_NTC3_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xg9ZeL_CtEIKuLm_H3DZ3F4HiMb4AoPozk4YDq2Piw84= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB0_NTC2_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 - LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 0 - NonTemporalC: 2 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 925 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB0_NTC2_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x512_MI16xf8Ie6tqcBuD2Jujp65Vo-v0ikwjRIIuuqJfqvYMHnLI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 512 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC3_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false - LSCA: 512 - LSCB: 512 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 4 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 3 - NonTemporalC: 3 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 926 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC3_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x16x512_MI16xJmNUwCek8yAJmaYGDUGFMiPxUpHP50QvJWrzpjcN2QM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 512 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC1_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: false - LSCA: 512 - LSCB: 512 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 927 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC1_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xumwbQLBsbTuiz-xmysx9cyRtIiz8u-ZdtZLjBu_bjSk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 2 - LSPB: 2 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 17408 - LdsInitCVgprs: false - LdsNumBytes: 17408 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 8704 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 64 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 928 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x16x256_MI16xctuAv9IQDoLZTgUSJDYmiBJF5KfirHnQYPm1jNLEZV4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 50688 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 50688 - LdsOffsetB_Blk: 116224 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50688 - LdsOffsetMetadata_Blk: 116224 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [3, 1] - MIWaveTileA: 3 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 16 - MacroTileA: 96 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 24 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 24 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 929 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 1 - ThreadTileA: 12 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x64_MI16Te1XcBfAkTj_Si2BZ_O928OQ0H5M89MFZVckX2T9Ct0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA3_NTB0_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120832 - LdsInitCVgprs: false - LdsNumBytes: 120832 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 6] - MIWaveTileA: 6 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 192 - MacroTileA: 192 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 72 - NumLoadsA: 6 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 930 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA3_NTB0_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 6 - ThreadTileA: 24 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16xKU2-gNLRtPrqTDeoW6Ox1NxZrNH8LrYdSR1cZojayJQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA3_NTB4_NTC7_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58112 - LdsInitCVgprs: false - LdsNumBytes: 58112 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 1] - MIWaveTileA: 8 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 4 - NonTemporalC: 7 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 931 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA3_NTB4_NTC7_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x384x64_MI16kw4DR3bscLfN9kE6MIrDpfvGwO5OsgS18hHb5_DgRlA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x384x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB7_NTC6_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 50688 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 84480 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 84480 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 384 - MacroTileA: 128 - MacroTileB: 384 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 7 - NonTemporalC: 6 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 12 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 12 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 932 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x384x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB7_NTC6_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x512x64_MI16Qzt3xLpEn8XX7WzJUeYcLb3pNOyn3wp2aHDtperEteo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x512x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC5_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 84480 - LdsInitCVgprs: false - LdsNumBytes: 84480 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 67584 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 147968 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 84480 - LdsOffsetMetadata_Blk: 147968 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 512 - MacroTileA: 128 - MacroTileB: 512 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 5 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 933 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x512x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC5_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT144x448x64_MI16JmldHSP3XTQNWsmOT-Ia1CQdNXcim5Rvy9qZPhyc0ZQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT144x448x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT9_7_MO40_NTn1_NTA2_NTB7_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 159744 - LdsInitCVgprs: false - LdsNumBytes: 159744 - LdsNumElementsAlignedA: 20736 - LdsNumElementsAlignedB: 59136 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 79872 - LdsOffsetB: 20736 - LdsOffsetB_Blk: 100608 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20736 - LdsOffsetMetadata_Blk: 100608 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [9, 7] - MIWaveTileA: 9 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 144 - MacroTile1: 448 - MacroTileA: 144 - MacroTileB: 448 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 252 - NumGlobalWriteVectorsPerThread: 252 - NumLoadsA: 18 - NumLoadsB: 14 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 18 - NumLoadsPerpendicularB: 14 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 934 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT144x448x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT9_7_MO40_NTn1_NTA2_NTB7_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 36 - ThreadTile1: 7 - ThreadTileA: 36 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI1647OpdIdLsPjkx5JGNkLWyaZgGxs4t4kA19t3ZHCMmwc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB1_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 - LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 25600 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 4 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 935 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB1_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI16fQVQR5gcnScAmEP6ehXKtAM_-znIoCb6q9SZSpafuYI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB3_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 - LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 936 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB3_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS64_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 64 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16uPL4WiDVSfjrZh1Qou0vhKt44u-gQhgLc59de4dj6Ys= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB2_NTC6_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 937 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB2_NTC6_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16RHQsbTBHWgnm-nubTl7uKOe4m19tjMDivayCq2jENWg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA1_NTB3_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 134144 - LdsInitCVgprs: false - LdsNumBytes: 134144 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 32256 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67072 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 101888 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 101888 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 14] - MIWaveTileA: 4 - MIWaveTileB: 14 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 56 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 938 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA1_NTB3_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 14 - ThreadTileA: 16 - ThreadTileB: 14 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16nJXuNiaEvLyPIwe49PAbt3uaW6BX0758KZ4tlFZYNy0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB1_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 939 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB1_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI162HlKNw6yd-3bjBU1NA3fCbhfgdU76LtwrqYq5hCFoYE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 940 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16LLVRSYLO0ehe7plK3eqGuv5AKPIUn4lEFOgWK2IcQaI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_12_MO40_NTn1_NTA0_NTB0_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 110080 - LdsInitCVgprs: false - LdsNumBytes: 110080 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 26112 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 83968 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18432 - LdsOffsetMetadata_Blk: 83968 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 12] - MIWaveTileA: 2 - MIWaveTileB: 12 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 192 - MacroTileA: 128 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 941 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_12_MO40_NTn1_NTA0_NTB0_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 12 - ThreadTileA: 8 - ThreadTileB: 12 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI16dc1RWq5OI8ZYITl452_XtceKzZW8LbRsM-orvD1QNOk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB5_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 - LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 942 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB5_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16OPWJ2Vhbc174MGrC2nliEfLa8lucgLmzdcw90jlLzRM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA2_NTB7_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 122368 - LdsInitCVgprs: false - LdsNumBytes: 122368 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 10] - MIWaveTileA: 4 - MIWaveTileB: 10 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 7 - NonTemporalC: 0 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 8 - NumLoadsB: 20 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 20 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 943 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA2_NTB7_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 10 - ThreadTileA: 16 - ThreadTileB: 10 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16x7VuXIvkVBdaZXVrBpv-ptK0ippKvTvEO8ANryNDeJlc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB2_NTC5_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 - LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 15360 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 16 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 944 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB2_NTC5_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 7 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x96x128_MI16EwRs6OAL3-4d8o-I461H8f5l3jKJLNSfGEWNZTEh1Rk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x96x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA3_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 162816 - LdsInitCVgprs: false - LdsNumBytes: 162816 - LdsNumElementsAlignedA: 55296 - LdsNumElementsAlignedB: 26112 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 81408 - LdsOffsetB: 55296 - LdsOffsetB_Blk: 136704 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 55296 - LdsOffsetMetadata_Blk: 136704 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [3, 6] - MIWaveTileA: 3 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 96 - MacroTileA: 192 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 72 - NumGlobalWriteVectorsPerThread: 72 - NumLoadsA: 12 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 12 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 945 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x96x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA3_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 6 - ThreadTileA: 12 - ThreadTileB: 6 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI32hKcHKQ8o49ASNY6zmjNreC_Q05A1UJzz-nd9LNa076Q= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC2_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 2 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 946 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC2_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI32uyddi-_Ova4MSaGan0jTebbhqveeajquDhTi1TkMuZk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA1_NTB3_NTC4_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 134144 - LdsInitCVgprs: false - LdsNumBytes: 134144 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 32256 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67072 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 101888 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 101888 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 7] - MIWaveTileA: 2 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 112 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 947 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA1_NTB3_NTC4_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM48_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x128_MI16fX1QLDzfCT3eH0e3dBdSNXN2QL17b_7okYOLyAb875k= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB5_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 3 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 948 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB5_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16qqehBsEZr-PXGp_8_xa2MYaLCdnLBRh2rL6ItyCL5Xg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA3_NTB5_NTC3_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67072 - LdsInitCVgprs: false - LdsNumBytes: 67072 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 32256 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 165888 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67072 - LdsOffsetMetadata_Blk: 165888 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 14] - MIWaveTileA: 4 - MIWaveTileB: 14 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 - NonTemporalC: 3 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 56 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 949 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA3_NTB5_NTC3_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 14 - ThreadTileA: 16 - ThreadTileB: 14 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x80x128_MI16CZh-cb17lQfltxiK-93MVQ0BdXoCi3XDe8XUoqTGd_Y= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_5_MO40_NTn1_NTA2_NTB2_NTC1_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 - LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 80 - MacroTileA: 128 - MacroTileB: 80 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 950 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_5_MO40_NTn1_NTA2_NTB2_NTC1_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 5 - ThreadTileA: 8 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16tIqhwvntux5qIxAZw2mfP-BUQi-NomTke_pd_eoxJcY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA1_NTB2_NTC3_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 100864 - LdsInitCVgprs: false - LdsNumBytes: 100864 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 83968 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18432 - LdsOffsetMetadata_Blk: 83968 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 8] - MIWaveTileA: 2 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 3 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 951 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA1_NTB2_NTC3_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16Sr0lzEjPFUwdbm92eaiotYuMYY1y2wzlUPpDAwL44-o= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB5_NTC3_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67072 - LdsInitCVgprs: false - LdsNumBytes: 67072 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 32256 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 165888 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67072 - LdsOffsetMetadata_Blk: 165888 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 14] - MIWaveTileA: 4 - MIWaveTileB: 14 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 3 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 56 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 952 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB5_NTC3_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 14 - ThreadTileA: 16 - ThreadTileB: 14 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16AslD6pXW7JSF6dZ1qn1k9X58w5Ll3Av-KWUeReqv7ks= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB0_NTC6_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 104064 - LdsInitCVgprs: false - LdsNumBytes: 104064 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 16 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 953 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB0_NTC6_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI166Fyl5G2lYgtbZ9K5E4W-XtjRtkkUMRv8UUUTyMyvF8I= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC5_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 954 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC5_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16n0bQkRZ6qxeoGAdUu7GJQfmJ5xS0chekr0sv0pJVQ6o= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC4_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 955 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC4_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16hp2ricknUsKMxThnp_RJuqI1ZmNgVw5ACFCO95WQUN4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 956 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI32alykEUyYVzzxaIZSLyvFAAE0grgJ-xPZkdbVRFEkzx0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB1_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51712 - LdsInitCVgprs: false - LdsNumBytes: 51712 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 98816 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51712 - LdsOffsetMetadata_Blk: 98816 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 1] - MIWaveTileA: 8 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 957 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB1_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 128 - ThreadTile1: 1 - ThreadTileA: 128 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x64_MI32x3izqHgbM-d6Up_yu3myD76PCC4UCCwwHTLbQ9b8QsUu0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB2_NTC1_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49792 - LdsInitCVgprs: false - LdsNumBytes: 49792 - LdsNumElementsAlignedA: 8320 - LdsNumElementsAlignedB: 8704 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8320 - LdsOffsetB_Blk: 41088 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8320 - LdsOffsetMetadata_Blk: 41088 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 958 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB2_NTC1_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x64_MI32xpjCEuAtTyUsr6Zedfsbzg9qjaO-ZRBdlA5Jm10I4ZC8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58112 - LdsInitCVgprs: false - LdsNumBytes: 58112 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 16640 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 959 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI32kZnI0mYofuEpwrNzqvlU5UpA03orT8bQpcZIcGeQYGc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 100352 - LdsInitCVgprs: false - LdsNumBytes: 100352 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 960 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI161IkOm794th9xtmf-mkwBuO231HubzKHTyLu7Tu4WjhE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA0_NTB1_NTC2_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 103552 - LdsInitCVgprs: false - LdsNumBytes: 103552 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 2 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 4 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 961 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA0_NTB1_NTC2_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI1kAtWydlVVtA2mSO1ULYhEPtiBvOdTizaO9yfe5372HA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB1_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 962 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB1_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x128_MI1d9s27EZuaxdvWTFGKd32hMyof8qLlVr_EIHGJTWt2gU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA3_NTB0_NTC0_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 159744 - LdsInitCVgprs: false - LdsNumBytes: 159744 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 46080 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 79872 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 113664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 113664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 963 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA3_NTB0_NTC0_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI166fWKovV9hpGO4qaYPlsZA2HxVyg0NJO0rymRtakR8pY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB3_NTC6_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 - LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 964 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB3_NTC6_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x80x128_MI167pMAm5zbyGdER3KFEA27FXZCXlMviGTf4r_9_1FgSjc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB2_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 - LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 80 - MacroTileA: 128 - MacroTileB: 80 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 965 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB2_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 5 - ThreadTileA: 8 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI32NBLTHx_Gpoq8atMJtHhkIJxDKCjmWWFkjAdWqZEmCUQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC6_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 - LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 966 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC6_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 3 - ThreadTileA: 64 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x128_MI331VPimmorTQ8AYzGf1NiwfbagqGmyo8CLAfBEjaqiyM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA2_NTB0_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 156672 - LdsInitCVgprs: false - LdsNumBytes: 156672 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 43520 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 78336 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 113152 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 113152 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 5] - MIWaveTileA: 1 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 8 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 967 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA2_NTB0_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16p5RxYLrKEnbA-HIE4lUtKZzZTxM_IdhaDXFJXp6hszs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB0_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 103552 - LdsInitCVgprs: false - LdsNumBytes: 103552 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 4 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 968 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB0_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI16ati3xugoO08PmQml2qZEayshFx1-5FgxjyXEFaYYolM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB1_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 - LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 969 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB1_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI1rflT5zVO8AZ_i0y-GDDJCkoXv07gCcOniLc9DpvSd_4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA2_NTB3_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 970 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA2_NTB3_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI1YXqSvjevjQLvpyt_2-23TZqpzBG81diWO1-fnvsF4p8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB7_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 7 - NonTemporalC: 5 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 971 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB7_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI3u2tvE22ubVlGLexgnY5yHusorOdC_u5-aXgigV89i1c= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB5_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 5 - NonTemporalC: 4 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 972 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB5_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16mzQgXvkDqVGeE12hDFqEfYTgFxPQD8C0vOou3AMGIiE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA2_NTB7_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 8] - MIWaveTileA: 4 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 7 - NonTemporalC: 4 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 973 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA2_NTB7_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16f748OZlQOumtIueQrxHYmlXNlyUnnD4XzYf2Bm5yU4k= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA2_NTB7_NTC6_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 8] - MIWaveTileA: 4 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 7 - NonTemporalC: 6 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 974 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA2_NTB7_NTC6_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI169TqeNJ3f-dh_Q0utTQqpI7QgdML5AZVpV3NYhEbNG5A= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB4_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 - LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 10] - MIWaveTileA: 4 - MIWaveTileB: 10 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 6 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 975 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB4_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 10 - ThreadTileA: 16 - ThreadTileB: 10 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x288x64_MI16gBjiLcqzSdUOIqOP-6Lctg5OStx9zfcjJsmowsuRFdc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x288x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA1_NTB6_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 129024 - LdsInitCVgprs: false - LdsNumBytes: 129024 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 46080 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 9] - MIWaveTileA: 4 - MIWaveTileB: 9 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 288 - MacroTileA: 128 - MacroTileB: 288 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 6 - NonTemporalC: 4 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 4 - NumLoadsB: 9 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 9 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 976 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x288x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA1_NTB6_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 9 - ThreadTileA: 16 - ThreadTileB: 9 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16N7jXF8_Qf0aNxA7L42Afhh3P668h0pApiapyDg0ijN8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB2_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 104064 - LdsInitCVgprs: false - LdsNumBytes: 104064 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 16 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 977 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB2_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x128_MI1gdbabCK4Cbk9dzcNz5MESH6Gr-pKALrY10oVBuGVbMQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 159744 - LdsInitCVgprs: false - LdsNumBytes: 159744 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 46080 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 79872 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 113664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 113664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 978 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x128_MI1A0XIwQXFu-OwTM6Pph_d6vN_vE-SazquoaubqUZ6nx4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA0_NTB4_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 159744 - LdsInitCVgprs: false - LdsNumBytes: 159744 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 46080 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 79872 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 113664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 113664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 6 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 979 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA0_NTB4_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16ciosobcQH4ypKgT0_et38BzWL178BbBE8UBJCt-UCSc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA3_NTB7_NTC4_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 - LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 10] - MIWaveTileA: 4 - MIWaveTileB: 10 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 4 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 980 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA3_NTB7_NTC4_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 10 - ThreadTileA: 16 - ThreadTileB: 10 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16EtL1yGW5yKYdxiu0Uq_Vhn2S_4vGR20I70CKzgpDFiI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB4_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 - LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 57856 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 10] - MIWaveTileA: 4 - MIWaveTileB: 10 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 5 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 981 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB4_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 10 - ThreadTileA: 16 - ThreadTileB: 10 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16VHPGU0mrqXBy8pYIryFrS1YaLKj2KVsHrwljqR3OZG4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA3_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 - LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 10] - MIWaveTileA: 4 - MIWaveTileB: 10 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 982 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA3_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 10 - ThreadTileA: 16 - ThreadTileB: 10 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x96x64_MI16xgH-uNasQNnFkPd_mYchLnJZB38K2VRlEuBoijC6t7hQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA1_NTB4_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 112000 - LdsInitCVgprs: false - LdsNumBytes: 112000 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 12672 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 3] - MIWaveTileA: 8 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 96 - MacroTileA: 256 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 4 - NonTemporalC: 7 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 983 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA1_NTB4_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 3 - ThreadTileA: 32 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16C3wndprfgaJVK5dROkGkV3eLKOnVhaJmzYWboVPNjAk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA1_NTB2_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 134144 - LdsInitCVgprs: false - LdsNumBytes: 134144 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 32256 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67072 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 101888 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 101888 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 14] - MIWaveTileA: 4 - MIWaveTileB: 14 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 56 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 984 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA1_NTB2_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 14 - ThreadTileA: 16 - ThreadTileB: 14 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x224x64_MI16IHgIlbUBAh-oojhpF9EcJ-8MdIl6f-EF1K5BOtbiZv8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA2_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 112000 - LdsInitCVgprs: false - LdsNumBytes: 112000 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 7] - MIWaveTileA: 4 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 224 - MacroTileA: 128 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 7 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 112 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 4 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 985 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA2_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 7 - ThreadTileA: 16 - ThreadTileB: 7 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x224x64_MI16WEGE0_8tn9YpH6Yj31JWn7UWKY9tJMWNlzgHut1qGrs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA2_NTB2_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 112512 - LdsInitCVgprs: false - LdsNumBytes: 112512 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 7] - MIWaveTileA: 4 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 224 - MacroTileA: 128 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 112 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 16 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 986 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA2_NTB2_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 7 - ThreadTileA: 16 - ThreadTileB: 7 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16TOtr1qGMLBNQmRW-tQKRjnkljLO4s0f7ail8-SeQfNE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA3_NTB7_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 8] - MIWaveTileA: 4 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 6 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 987 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA3_NTB7_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16fJg9VMUMnDvFlaadft4fFJBR-AZ1KOiQ3dSFORyhhu8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB5_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 5 - NonTemporalC: 4 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 988 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB5_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16ElsNwi58ewsT3E5tO-d6OS5JheuzUEoDPOeBnRLE0Sk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB5_NTC0_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 8] - MIWaveTileA: 4 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 0 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 989 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB5_NTC0_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16_gkn1MbHNokE7gxDIgmnU285d2stkvVABZGxoxsmzsw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB6_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117248 - LdsInitCVgprs: false - LdsNumBytes: 117248 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 8] - MIWaveTileA: 4 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 4 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 32 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 990 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB6_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16krsgrbs75pydUq9sNfBGF5cwoLvZWAZSo8LLuYaH2g8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA1_NTB7_NTC1_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117248 - LdsInitCVgprs: false - LdsNumBytes: 117248 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 8] - MIWaveTileA: 4 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 32 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 991 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA1_NTB7_NTC1_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16Z4mMoWpvvYfd28olgkYANQt7aA3dTeeCxqYS0kr75oE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA3_NTB5_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 - NonTemporalC: 6 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 992 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA3_NTB5_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI32nmGZ2oVA-8Jn23FdQrljhHKbRG58SGmQzbu9CyyS8NU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB4_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 4 - NonTemporalC: 5 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 993 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB4_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI1695c3DsOHZbdpyygb9R_j_oeZk16dsu5191x0V4IRQUI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB1_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 994 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB1_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16ACeXLyHEcgiJAoxHPQR1J10_cctGQw9JjXi0jAkoAlA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB5_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 5 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 995 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB5_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16I6pIOdQfdhrAh_Ct90BPYyv2Ys5r9nVOqCfHw4PmURE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB4_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 996 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB4_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI160Y1ykUqDNlgNI3WZEFLn1XS5x0MQd0hB7oTscWM7PYQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB5_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 0 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 997 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB5_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI32I_Z0J9kV32gwjW-5P043f5A9XM-6LlLmeEeRzLC3LME= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB6_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117248 - LdsInitCVgprs: false - LdsNumBytes: 117248 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 4 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 998 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB6_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM4_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI32nFZoUwk3FHsU8Iz8ivk3xvSJGQDd0z6LBp721gWwwi4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB7_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117248 - LdsInitCVgprs: false - LdsNumBytes: 117248 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 7 - NonTemporalC: 0 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 999 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB7_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16pVlEkTrKmft7dNlMWD0Ftam1zDhIu_5PcdHIezGiENE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB6_NTC1_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1000 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB6_NTC1_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16H7i9rgQAY-Hmn-PxHxFHj01qxvcZOQojUwk0rK5auCM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB7_NTC5_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 7 - NonTemporalC: 5 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1001 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB7_NTC5_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16At5zYKocGNc0tC84h2Ca8-f8RPzN85vXJiXN0gxmcHI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB6_NTC2_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 2 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1002 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB6_NTC2_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16svLLW3Wp5iCzbCDvMSBg--Q7EUrsgPyAVe_YYOHVtq8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB5_NTC3_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 5 - NonTemporalC: 3 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1003 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB5_NTC3_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16BqfWJb9IrydrMmIImYE7syAC7pjucPL3a3-FFE89DxI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1004 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16YT_go3YPrH3Z0c4g5-iW9-4P9cNreauSDieOyjiGT2o= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB7_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67072 - LdsInitCVgprs: false - LdsNumBytes: 67072 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 32256 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 165888 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67072 - LdsOffsetMetadata_Blk: 165888 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 14] - MIWaveTileA: 4 - MIWaveTileB: 14 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 7 - NonTemporalC: 7 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 56 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1005 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB7_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 14 - ThreadTileA: 16 - ThreadTileB: 14 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16EYc6YLLTfTALKcv_uSgC9T3VVBS6RDQti9l8AqlphA8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB5_NTC0_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 0 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1006 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB5_NTC0_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16cSpaXzVJf7-fhy2HNPfwWWcmfyqd6n6NBhQOfp3rQME= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB6_NTC6_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 6 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1007 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB6_NTC6_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16g2ho-xQD4baNle8vkD22b-jgLJQuXr4IH_t5S3ZpwXY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA1_NTB6_NTC3_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 16] - MIWaveTileA: 4 - MIWaveTileB: 16 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 6 - NonTemporalC: 3 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1008 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA1_NTB6_NTC3_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 16 - ThreadTileA: 16 - ThreadTileB: 16 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16s9bUjn0IcaFSiVUlrZ9EMKD1QVEVRfZVXI2lSdtb8pY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA0_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 4] - MIWaveTileA: 16 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1009 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA0_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI161-5EWi-_enLvAVYrVHpCQaPgqL9xF0xXjVsOCQ-oouo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA2_NTB7_NTC1_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 4] - MIWaveTileA: 16 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1010 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA2_NTB7_NTC1_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM2_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16Yg3CGvJoR66gSwBI8RhiSxLWsQ5RxFKb1LbZoBv6LyE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB4_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 4 - NonTemporalC: 3 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1011 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB4_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16VNB3XaOhh2-NR1XK6ZRAuPEEeUPSbQp9a8A9aTqcUwM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA1_NTB4_NTC1_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 16] - MIWaveTileA: 4 - MIWaveTileB: 16 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 4 - NonTemporalC: 1 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1012 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA1_NTB4_NTC1_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 16 - ThreadTileA: 16 - ThreadTileB: 16 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16SSiNEW16nYcsffeycDFQkvui5kpDWe-X_zn_bhPr9ig= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_12_MO40_NTn1_NTA2_NTB6_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 60928 - LdsInitCVgprs: false - LdsNumBytes: 60928 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 26112 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 60928 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 12] - MIWaveTileA: 4 - MIWaveTileB: 12 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 6 - NonTemporalC: 2 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1013 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_12_MO40_NTn1_NTA2_NTB6_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 12 - ThreadTileA: 16 - ThreadTileB: 12 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16mYzq1ZgGOir3WNse9Qh17J43TFE_95PscgKGYfll0hE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 3] - MIWaveTileA: 16 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 7 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1014 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 3 - ThreadTileA: 64 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16dfcXFDA8SLUX-vrGy7rbc8Q4fMWlcRwN_wfQyscMJfU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB7_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 7 - NonTemporalC: 5 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1015 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB7_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16zYydvC1cxtIV2-tn0CB4-WHOsq99l20ygGB_zv7hJIo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB7_NTC1_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 3] - MIWaveTileA: 16 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1016 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB7_NTC1_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 3 - ThreadTileA: 64 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16MW9U_2TPrZeYH5ppNd7JzZ-alNEiIAH_s3i0V-M_-3E= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA1_NTB6_NTC3_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67072 - LdsInitCVgprs: false - LdsNumBytes: 67072 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 32256 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 165888 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67072 - LdsOffsetMetadata_Blk: 165888 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 14] - MIWaveTileA: 4 - MIWaveTileB: 14 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 6 - NonTemporalC: 3 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 56 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1017 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA1_NTB6_NTC3_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 14 - ThreadTileA: 16 - ThreadTileB: 14 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16ns-fA46zepLUpH1jert-cTWxNUIJah040YS1-eGNBds= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB1_NTC6_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1018 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB1_NTC6_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16Rha03v4MhMNb6jRDsDBiP1h1rTlTKthb7HiWAG9Arb0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1019 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16Tdz_yQM1ZwK4kR7vHKsmLRZdrnCpmmXtZDr7tmUSq8A= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA4_NTB0_NTC7_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 104064 - LdsInitCVgprs: false - LdsNumBytes: 104064 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 16 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1020 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA4_NTB0_NTC7_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI3dLSbUx9BEOTzqeT_H8ocDARlxEx7Dgff2ylxd52Ls_M= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA3_NTB3_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 136192 - LdsInitCVgprs: false - LdsNumBytes: 136192 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 68096 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1021 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA3_NTB3_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 1 - ThreadTileA: 64 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16ZgNaKnH_H8ts65iOQcCcEIRBXR3CLNqivzkc2_gpqqA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB1_NTC0_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 104064 - LdsInitCVgprs: false - LdsNumBytes: 104064 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 16 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1022 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB1_NTC0_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI161MgVG7V2_jRTY_t_zmbX6GCxZrZN-OQf9f4PZ93N0FY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA2_NTB4_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 8] - MIWaveTileA: 4 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 4 - NonTemporalC: 5 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1023 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA2_NTB4_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x128_MI1a9La1t4bkGflOu9rs7esrQg1R2kdM8mOtN2cokKsxxM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB2_NTC5_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 159744 - LdsInitCVgprs: false - LdsNumBytes: 159744 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 46080 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 79872 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 113664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 113664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1024 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB2_NTC5_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI165Ph5FzTBInU7wlUtI1XvBDV5Vo5H7CclCV448dKEBjk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA3_NTB3_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 110592 - LdsInitCVgprs: false - LdsNumBytes: 110592 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 6] - MIWaveTileA: 4 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 192 - MacroTileA: 128 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1025 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA3_NTB3_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 6 - ThreadTileA: 16 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16nHFuOFllFsGK0KFewYZ4htEdbnVYLgfLbmrPe_PQ2wE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB3_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 - LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 10] - MIWaveTileA: 4 - MIWaveTileB: 10 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 3 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1026 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB3_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 10 - ThreadTileA: 16 - ThreadTileB: 10 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16EFOjhm0P5myZD2ZUMYoJl8PTasXYyawjrjWCNZamLSk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA3_NTB0_NTC6_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 110592 - LdsInitCVgprs: false - LdsNumBytes: 110592 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 6] - MIWaveTileA: 4 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 192 - MacroTileA: 128 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1027 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA3_NTB0_NTC6_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 6 - ThreadTileA: 16 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x128_MI1tnbzwD61gJ8-eEJNYtMNbLQ5OG5mlyk0onu9l1wLgKs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA1_NTB1_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 159744 - LdsInitCVgprs: false - LdsNumBytes: 159744 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 46080 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 79872 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 113664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 113664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1028 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA1_NTB1_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x288x64_MI16b-0hvop7nM8pniQgukyTovYnCjkagpUYftIy3lyOPEg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x288x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA2_NTB3_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 129024 - LdsInitCVgprs: false - LdsNumBytes: 129024 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 46080 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 9] - MIWaveTileA: 4 - MIWaveTileB: 9 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 288 - MacroTileA: 128 - MacroTileB: 288 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 4 - NumLoadsB: 9 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 9 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1029 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x288x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA2_NTB3_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 9 - ThreadTileA: 16 - ThreadTileB: 9 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x128_MI1LuOxONfMHSlq_UjUT9xBppI-AnllztoOTZImOhMcZQQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB0_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 159744 - LdsInitCVgprs: false - LdsNumBytes: 159744 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 46080 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 79872 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 113664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 113664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1030 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB0_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16u2x9kOu0hKWVZugV0AqUr0nPgwFHsYz3EYunVxdL5Qc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA3_NTB3_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 - LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 10] - MIWaveTileA: 4 - MIWaveTileB: 10 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1031 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA3_NTB3_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 10 - ThreadTileA: 16 - ThreadTileB: 10 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16wcWGExQ1bTDNkVseRcVXig4uUydDs-9d2wRriz03UdY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB1_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1032 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB1_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16NZL-75Ki7XAsNLaTZihU08lWBBhRtgHjnaUOLckBuQI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB3_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1033 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB3_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x256x64_MI16AyKb6vtbmqKKHAvrJyFKhPUETr4Ol8uWmDuvH1-1B3c= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA1_NTB3_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 8] - MIWaveTileA: 4 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1034 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA1_NTB3_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16kDiDPfYefDawU7pB4UeOMRV40AcW7hmQKVxJkDN9xHg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB0_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1035 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB0_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x256x64_MI165tbp6mPoSU2BMK3wSMdm_VkVQsYaVhsiCr3oS3quvJg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB5_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 6 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1036 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB5_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI160kDl7yaU9gYvW2vW2lN_qXwOqy2Lr0Qto6vNjcx7UNA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA2_NTB1_NTC6_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 - LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 10] - MIWaveTileA: 4 - MIWaveTileB: 10 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1037 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA2_NTB1_NTC6_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 10 - ThreadTileA: 16 - ThreadTileB: 10 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT160x256x64_MI16XoNfbiF6htIEwHHGVe6564vtK9vzaVwuBM8Vr_ZZsEg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA1_NTB2_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 21120 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 21120 - LdsOffsetB_Blk: 86656 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 21120 - LdsOffsetMetadata_Blk: 86656 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [5, 8] - MIWaveTileA: 5 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 256 - MacroTileA: 160 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 160 - NumLoadsA: 5 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1038 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA1_NTB2_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 8 - ThreadTileA: 20 - ThreadTileB: 8 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x320x64_MI16Sg-N8yW_Z1bcX3qC7j2-LFytSJN-Mus_91wMBA5vIK8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x320x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB0_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 42240 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 320 - MacroTileA: 128 - MacroTileB: 320 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 4 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1039 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x320x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB0_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16zEfK9g7EM94jFa32eAXtrbKTIojORHQ115zy-i3xxP8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB5_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 7 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1040 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB5_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16H9YoRVYXK40Eq2iR_4OzhYcMfwzyeTxbpmZETsE2fiU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB4_NTC7_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 7 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1041 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB4_NTC7_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI329SlG48-ea0-kImal4lo2_OisWj-a-6M71h-ef_F18f8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB1_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 - LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 2 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1042 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB1_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x320x64_MI16_qIxqjmEkQ0ngAkEKOzmMX3i6qKwWQILcBHveDPCXh4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x320x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA3_NTB6_NTC5_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 42240 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 320 - MacroTileA: 128 - MacroTileB: 320 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 5 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 4 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1043 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x320x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA3_NTB6_NTC5_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM32_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16QK03hBnQ78u4OPWFbqStxrwJ2M-jC8ammqO3Ik2XybI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB1_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1044 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB1_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16_6rUGns54P4iRm9x7g6oqi5vQPI36h5UtnBDtmSe6sk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB1_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 3] - MIWaveTileA: 16 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1045 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB1_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 3 - ThreadTileA: 64 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16CnaJJAsUKAIiiJ70KHC-E7xFWh61mD9O3Q_9gf-Qxho= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 4 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1046 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI32lsdl6vHUJOLzJWQ0WkBk9t0H-hM_F69mI50m3O7qq1s= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB3_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 - LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1047 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB3_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x224x64_MI16NualOvuZ9pPE0JQcqfMNqi8otcaKx69vcKn-FKPXHr0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA1_NTB1_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 53248 - LdsInitCVgprs: false - LdsNumBytes: 53248 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 35840 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 53248 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 7] - MIWaveTileA: 4 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 224 - MacroTileA: 128 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 112 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 4 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1048 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA1_NTB1_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 7 - ThreadTileA: 16 - ThreadTileB: 7 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16RGn038FBreYZmQWbgae1UhiFktt_aCyc_OZlbzMDxPA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB2_NTC4_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1049 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB2_NTC4_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI32cIpEnTcE84fZ9PRiEIWKoRIX7eCuPz1mtyvvWg3c7h4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 134144 - LdsInitCVgprs: false - LdsNumBytes: 134144 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 32256 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67072 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 101888 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 101888 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 7] - MIWaveTileA: 2 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 112 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1050 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16HOObusf0o8NdAHXuyBZqLdkOGGwuJvOmTtBo1MNJUfM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA2_NTB4_NTC4_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 4] - MIWaveTileA: 16 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1051 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA2_NTB4_NTC4_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16UQbNvSEceN_LPCaQJTQJhZ53iF7mHC2TxEbC9JAyDt0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA1_NTB7_NTC7_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67072 - LdsInitCVgprs: false - LdsNumBytes: 67072 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 32256 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 165888 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67072 - LdsOffsetMetadata_Blk: 165888 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 14] - MIWaveTileA: 4 - MIWaveTileB: 14 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 7 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 56 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1052 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA1_NTB7_NTC7_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 14 - ThreadTileA: 16 - ThreadTileB: 14 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16ZhBJaC9kNuzX0DxuvPiBUyTs7czkHL2T6LAodogjRYA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1053 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16iWCfOsxx7jtqzHvFdOeuUoXxcQKvozyUXMPq0hRBXDc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA0_NTB4_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 4] - MIWaveTileA: 16 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 5 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1054 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA0_NTB4_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16AhdIgk54mOOWxYUqzAf8U4VvuDScAwy08q1ER8wpVlw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA0_NTB7_NTC6_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 4] - MIWaveTileA: 16 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 7 - NonTemporalC: 6 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1055 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA0_NTB7_NTC6_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16hihObv1T3WmVXc1uIYjLBbI4NSrEq-9PuS1oNbN0anE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1056 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI166XBnTeKzS7DngXrqKvV7A1jTQSRqAxsgnUZz0ZC8Zao= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB3_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1057 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB3_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI168a6vgjEtsTCl4Q0XdqE3Z5kva194GGS2SzXNm3r2lDU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA0_NTB1_NTC6_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 3] - MIWaveTileA: 16 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1058 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA0_NTB1_NTC6_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 3 - ThreadTileA: 64 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16BBtPd9TAI-SMRvzVY79RSiFfiE4QRWzXKKDby2jqwPY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB7_NTC6_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 7 - NonTemporalC: 6 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1059 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB7_NTC6_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16xJEMM6LYN_e7vy_zaD7OGKbATqZz6KdhiH7HCam2VSA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB0_NTC4_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1060 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB0_NTC4_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16PEQAv9oY8qLEOVjBz5ssRS3aC7zKbq9XTXNSRQeir2w= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1061 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16Esi0nYSe8fXG-4BxVUxHKaH00tJb4onna0VjpYacw3M= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB4_NTC1_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 1 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1062 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB4_NTC1_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI32g92QdEFf4ASBLkEsH7Rj7VWoIOAPO8etK13WZFJvB78= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA2_NTB0_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 134144 - LdsInitCVgprs: false - LdsNumBytes: 134144 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 32256 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67072 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 101888 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 101888 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 7] - MIWaveTileA: 2 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 112 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1063 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA2_NTB0_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM2_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16zsJEWcUKOyQX_UOIXMCVIagPvbo_-4ymbSHjswClW6k= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1064 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16b6MQoTrxALPfBX4ESIm92NC5jFvjc65EJc65MuEttrs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB6_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 6 - NonTemporalC: 5 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1065 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB6_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16TdfBACs6vIRfpCjyT5NiH1OqsVAptBCAQGMhqI1aFnE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 3 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1066 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16uFIvoH8xDmCn5AEIJq75hujEFC54w18bP1YCDLkk7yY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB4_NTC1_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 4 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1067 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB4_NTC1_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16jZqm1c-l_pGzsGAznryzAG7OFgWf1w_WabnvsRGS8Eo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB4_NTC2_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 4 - NonTemporalC: 2 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1068 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB4_NTC2_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16Q2AdOK2nfivw4jm-kfj0Y7kpDhfNv-eHWF7SbAyqMdw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB5_NTC7_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 7 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1069 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB5_NTC7_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16WO_NN55n0LZqDtG-_tiQnWxk6RZWoiA2W9I_K8OTt_0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB5_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 3] - MIWaveTileA: 16 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 7 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1070 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB5_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM2_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 3 - ThreadTileA: 64 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16XWr7BISLqeiDp9croxxirvcAI7gaRXrmdV3Cu_FLdhY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1071 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI1695FkNHoDkjkfnLW2UzrBc6PvDDhFWtwrdew-CYY_QTY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 4 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1072 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI164GutLKCPh4OjvM7H_qT6XkIdkLDGdP3ftdLqyGFK_k4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC4_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1073 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC4_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16FAFIsm1iirmXv8ZP5wQgJTKxvUZLK-9T7gOlNRM6NtA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1074 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16K1f1is7knNvgs67ILLXNPZu6zVt7l0vnJtQTi4RuXYQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 3 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1075 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x256_MI16xdUUz19f8D8MI2nFIrQGpCuWZJFaM7Rd5Vs9SEMSWdgo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB2_NTC0_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1076 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB2_NTC0_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI13iKeSOb99hXcjX3CIHq3lBqvIRZskJ-O4olkIG0b1L0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB1_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 1 - NonTemporalC: 2 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1077 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB1_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x256x64_MI32-XT0Fb1H5FkJHG0p-F7E7Qil8yeb2hIr7_xWyvwW3Vs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA6_NTB3_NTC7_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 - LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1078 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA6_NTB3_NTC7_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x128_MI1qJ4AGc7nfkYrKtLmZgjXBzS3UEN4pDuMCmYTo2zCxOs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA5_NTB3_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 159744 - LdsInitCVgprs: false - LdsNumBytes: 159744 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 46080 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 79872 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 113664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 113664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1079 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA5_NTB3_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16SA-uhqgOr4Qh_kmIwcpTxdft7wtwadVjh8kqXncwQVc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA5_NTB3_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 8] - MIWaveTileA: 4 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1080 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA5_NTB3_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x288x64_MI16i-6EXNthkeJDUJNmvbETLZtaN1U_svNblisIqN6BwSg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x288x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA3_NTB0_NTC6_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 129024 - LdsInitCVgprs: false - LdsNumBytes: 129024 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 46080 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 9] - MIWaveTileA: 4 - MIWaveTileB: 9 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 288 - MacroTileA: 128 - MacroTileB: 288 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 4 - NumLoadsB: 9 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 9 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1081 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x288x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA3_NTB0_NTC6_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 9 - ThreadTileA: 16 - ThreadTileB: 9 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16Gt4L7W9lspFcka5wdZU0tkh4HDzGsjFwWgh-t_nB-JA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA6_NTB0_NTC4_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 - LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 10] - MIWaveTileA: 4 - MIWaveTileB: 10 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1082 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA6_NTB0_NTC4_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 10 - ThreadTileA: 16 - ThreadTileB: 10 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI320KPGFal3u58tw6FPWrPvcLpz5orX1K_AW2hzLezkSqo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 - LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1083 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM4_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI167wQVo-YG41mG6WK19J3LCEjVD16RtxByf3rSRmg9fsk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_12_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126464 - LdsInitCVgprs: false - LdsNumBytes: 126464 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 26112 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 12] - MIWaveTileA: 4 - MIWaveTileB: 12 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1084 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_12_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 12 - ThreadTileA: 16 - ThreadTileB: 12 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x288x64_MI164FDAqPkKkvz8Iv-fvZ_6L0AAGrPLIGAMkSlBsAOeRiU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x288x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA2_NTB2_NTC7_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 - LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 46080 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 9] - MIWaveTileA: 4 - MIWaveTileB: 9 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 288 - MacroTileA: 128 - MacroTileB: 288 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 7 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 4 - NumLoadsB: 9 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 9 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1085 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x288x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA2_NTB2_NTC7_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 9 - ThreadTileA: 16 - ThreadTileB: 9 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT160x256x64_MI16OqJGd14uQ52gRI1Rsi2UFdPxAm0swwNQHRLjTjfiTSI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 21120 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 21120 - LdsOffsetB_Blk: 86656 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 21120 - LdsOffsetMetadata_Blk: 86656 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [5, 8] - MIWaveTileA: 5 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 256 - MacroTileA: 160 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 160 - NumLoadsA: 5 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1086 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 8 - ThreadTileA: 20 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16bvofrvKNNxN3cveXIgt6QtUZ0U7s3nS0eIiKeEb4xfQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB0_NTC7_NTD6_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1087 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB0_NTC7_NTD6_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16aZk9zeuXQ7r7-Vw6xPDTlAKzJlBebYV6m41NfLjjPYc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB5_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 3] - MIWaveTileA: 16 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 - NonTemporalC: 4 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1088 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB5_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 3 - ThreadTileA: 64 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16fO1oASOze0Vg2zoh5hEn16uUUlBzEs4FD9alS1uM5mA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA3_NTB3_NTC5_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 4] - MIWaveTileA: 16 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1089 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA3_NTB3_NTC5_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16VJsYpVwb9QqJwaBn2Lit50HD8ymMpP4j6B0HYU4-oXQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA3_NTB0_NTC6_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 4] - MIWaveTileA: 16 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1090 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA3_NTB0_NTC6_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16LMwSxXj5zp3J9K8y9QEbSARIUTRup4D3jQD1UdgXFsI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 4] - MIWaveTileA: 16 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1091 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16bMZKzwvhzoixVP9zCi86UXG06PHHbcvgrOITFtU6sU4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB0_NTC5_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 3] - MIWaveTileA: 16 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 5 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1092 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB0_NTC5_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 3 - ThreadTileA: 64 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16bL9wrzQn5yuJSeFskoVsiFz69S8zIX7w-5J5HCkIEJY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB1_NTC7_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 3] - MIWaveTileA: 16 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1093 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB1_NTC7_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 3 - ThreadTileA: 64 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16_HwgB-5ytFjzfDU5AxTK1_7uAqjnRN6UbRkkfxeUqC4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB1_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1094 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB1_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16UPIGRlzE5I7x3ynQYJ-3nh8vEaG7Z582rxLtzwCT5D8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB2_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1095 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB2_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16zkh8X4QQrZTrsCN4WQ7GTYaOnlWMCun0aSaIEMcwQDk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB1_NTC4_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1096 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB1_NTC4_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16S6KV_wdPGr0pOqwvsXLdWjTvTzS4xPZ025IRHSW0ugE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB1_NTC3_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 3 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1097 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB1_NTC3_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI166mORZ_sQZm-feWyQ7nxv8YLKPNgHVlKW-cOQfRsEkW0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB1_NTC6_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1098 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB1_NTC6_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16noqae25_3KStTyIosT05Yu2Zfuq2kaD8xwGFvRu1JVg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 - LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 10] - MIWaveTileA: 4 - MIWaveTileB: 10 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1099 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 10 - ThreadTileA: 16 - ThreadTileB: 10 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16C6rbpU94tjmEgJRAd6muxl4-WucqDzcWk2vOzzM1_tY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB0_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 3] - MIWaveTileA: 16 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 5 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1100 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB0_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 3 - ThreadTileA: 64 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16dWLIk0a4GB9NXjspL5nt3eYrqi1uWCOpFwVujI_G-wE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA3_NTB1_NTC2_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67072 - LdsInitCVgprs: false - LdsNumBytes: 67072 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 32256 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 165888 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67072 - LdsOffsetMetadata_Blk: 165888 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 14] - MIWaveTileA: 4 - MIWaveTileB: 14 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 2 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 56 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1101 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA3_NTB1_NTC2_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 14 - ThreadTileA: 16 - ThreadTileB: 14 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16HUANMq2EUSkR8b7huy-UXY7HAdoJNx-I6gxGbL8WJek= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1102 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16KGEFctsEFTt8vPGbTqyWYKt4uOgo3ckf_waffwCvhfk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB0_NTC6_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1103 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB0_NTC6_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16zGkhbTybq3q7R8tqcH_041xzWUOs63S0Vfv5vdHNG0E= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1104 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16fO1Vl6_RK2pneSEmSiH6889f7udGqZF5en1jPjE7f5s= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 4 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1105 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16DolEhU6B8peQS3Xr6gLMSHrNYDRRdtZG1_lDoxfVxw4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1106 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16DqXLmuBHw91--EFJGqGve91ZKrAkVU5Dgdf4oFu5USc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB0_NTC6_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1107 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB0_NTC6_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16WCvet_bQ3nd_VHY5vJFI3xOqyx41kdfAbgxo8VKJSA0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA2_NTB2_NTC4_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 16] - MIWaveTileA: 4 - MIWaveTileB: 16 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1108 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA2_NTB2_NTC4_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 16 - ThreadTileA: 16 - ThreadTileB: 16 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16-VOP_WA2iIQOjjdqGpuPz5kr9KYtc-lRi-KptFlDH_U= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB1_NTC5_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1109 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB1_NTC5_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16nT5mD0zgHNuSuNO3LS4pIiNXb5nn8h4D7ahAEPo_gk8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1110 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16CA7Yb3Smt0lByzCMKtYT8mQpKMYNjHDTKRxOeHnvET4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC3_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 3 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1111 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC3_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16LTAnBB7dnOp3mFrtckU-sVhJHXdp4-pBAGhzx-rgxUI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1112 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI167SQROjbjyojkOid5V1kDdg0N5EBU1mGMl_BEBvSu4zs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC4_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1113 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC4_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16M8EcAH0KKwfGMOFiO9vHhf8fw2_F5Q_UiRQsEOLIhPQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB1_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 3] - MIWaveTileA: 16 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1114 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB1_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 3 - ThreadTileA: 64 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16rmzN-rQbzDadClNM6Z-qm0R3b12L4e3hK6iGCsTuU1s= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB3_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 3] - MIWaveTileA: 16 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 4 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1115 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB3_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 3 - ThreadTileA: 64 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16lz06vbyMMB-5-9k9BDFIPd3DEjSJ7iIbbmHAva1GmqM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1116 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16GWskxaMjYwjKTmkaq514txv9pulvSv_erT_VgSDE83c= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC2_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 2 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1117 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC2_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16tRFlWvKSF04gQ9PZokgQUcOKDhiq0TfToeDxswShft8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1118 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16Sx6-NSD-VB2hLGWLmrGjCnG5l_q7cf0-XzZBce8HOBM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67584 - LdsInitCVgprs: false - LdsNumBytes: 67584 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67584 - LdsOffsetMetadata_Blk: 164864 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1119 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16Ybv8p_jVFypl9sWQQFDDi1cPn8IzH4pi2AefDIhZ-G8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB3_NTC5_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1120 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB3_NTC5_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16LE-teoIckV9SwtN-HF7EEfMn0L4AQgAt_pdBX1jXvj4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1121 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16MgW3z512SZg0bOgviAw73jy_ZBOr8_SGK3gOhEG8QYw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC4_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1122 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC4_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16R_XGnIyKwD_ggRKOz3uhrre8cvOR_2HFLhMFeHl_S2c= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1123 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16E4BRmOTUaJeUyAaeCyhjNIjh6GpBB5szm02giyo0mxo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1124 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16ssYgjsPX3NIGr1xmyTwHB7XVv5sKgD0tDoOkgvN53SU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC7_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1125 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC7_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16MuVB9bB0zS45imskyXLOsB2x4bbsRUZI5PVWVKQwcY8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC4_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1126 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC4_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16OSkCnaTZhKg4ll1hDwDoJBMt5EdUQkVbuEM1IeK16cU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA2_NTB1_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 4] - MIWaveTileA: 16 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1127 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA2_NTB1_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM32_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16J7iDAyLmqVVIehsVSoYUv5fOo-XqS_59meivI4B4Kkc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC4_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1128 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC4_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16_NOb4H_Rw0hQCSPdl88-jl2JhI09qpVznCp_OeT5aAM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1129 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16bDp3iRHblovZwgU5K1M-8dA0RKxoDsyjHc1s5aDWaro= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC7_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1130 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC7_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16xUZcE0hE2Bib1NNuylLNWifCQmoKaQqEw2uEAoVv1_E= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1131 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x256_MI16xFmUG21QAMH0kUGV4BTe2JF_fY92putQfeCuWGZI7mbU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA2_NTB1_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117248 - LdsInitCVgprs: false - LdsNumBytes: 117248 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1132 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA2_NTB1_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x256_MI16xicdzBXpHaJqwqfaUVeyNltBgTlxJarQSuj2alvAGbVg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA5_NTB0_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 0 - NonTemporalC: 5 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1133 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA5_NTB0_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x128_MI16x9ijl6pmXU4AiAeoG_II5-NaljFlVEWMycwB2vm_x9AU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA6_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 110592 - LdsInitCVgprs: false - LdsNumBytes: 110592 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1134 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA6_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI16njp7CPA9beWp6S-w4r1lrpxYwCoEi-BmazO82Q-QAa0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA4_NTB0_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 - LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1135 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA4_NTB0_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x128_MI16xjbWgUiB8fBDD3qF886IbpXQNNUuYQaunGlFUAcRl9A8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA6_NTB2_NTC1_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 110592 - LdsInitCVgprs: false - LdsNumBytes: 110592 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1136 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA6_NTB2_NTC1_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI16oZJsBklcXmO9_EkFw0q2QxqKalKjsNqqSu7fqQgWQKE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA4_NTB1_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 - LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1137 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA4_NTB1_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16vdv5_0fbgLcwQ9ftVdstSpW297KX0NAT55Up98EJAPk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1138 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16wW2GdYbBAi4mXKt-RSmebm4FOMJ2nnnfaqgn_vLsIPI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC5_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1139 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC5_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16AOWOx76QxjeIBmUHaDWCAlzYn-r3NYlZn2Aymw1nadE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1140 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16HCQocKtpDSk1PmC_MhDWpSrG3Bl8gSHYlLzp3Rvv5xs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 7 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1141 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16J-b4eg25Bv93NUcDs4Vr6UOcuDThqY3_tV7ZVspNCcU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA2_NTB0_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 4] - MIWaveTileA: 16 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1142 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA2_NTB0_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT320x192x64_MI168YC1E6XlJcpiWqvuj2SFey4Y34gcRhSdKxrluzoR1bk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA1_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 73728 - LdsInitCVgprs: false - LdsNumBytes: 73728 - LdsNumElementsAlignedA: 46080 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 46080 - LdsOffsetB_Blk: 177152 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 73728 - LdsOffsetMetadata_Blk: 177152 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [10, 6] - MIWaveTileA: 10 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 320 - MacroTile1: 192 - MacroTileA: 320 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 120 - NumLoadsA: 10 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1143 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA1_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 6 - ThreadTileA: 40 - ThreadTileB: 6 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT320x192x64_MI168YC1E6XlJcpiWqvuj2SFey4Y34gcRhSdKxrluzoR1bk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA1_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 73728 - LdsInitCVgprs: false - LdsNumBytes: 73728 - LdsNumElementsAlignedA: 46080 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 46080 - LdsOffsetB_Blk: 177152 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 73728 - LdsOffsetMetadata_Blk: 177152 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [10, 6] - MIWaveTileA: 10 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 320 - MacroTile1: 192 - MacroTileA: 320 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 120 - NumLoadsA: 10 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1144 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA1_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 6 - ThreadTileA: 40 - ThreadTileB: 6 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16eDZsMfNwnfVMVs6i8i9BJmt4qLqSm7t3KWlOXVLZYDQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1145 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16IeIPnD0ASsH304iQErCsLeHn4UAt9Zvuggp2bi_iYtQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 7 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1146 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16CAdVKf_H9WahvROBJTW2fomY7fLU33Z2lZiHbQin9fA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC5_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1147 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC5_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16IVcnlac8U14ytqCcaTbrvhpCbfpFud3o2PBOZbeKFoQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1148 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16JBtQL-gXWn1uRsQ2oat1oL2PA04O9DaSF-Ii9Q-0ZUo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1149 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16d2N6DogEQ6hyfXHa_mSAsHyviMx3EZJLgQaxRgHFzts= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1150 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16DfVHvnACULGH37Sa_XE-P7tLcdtlwiwnOAe31qbXFmw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1151 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x32x512_MI16x1v-3zJDSL2tojKmJzevwA9SB1VS8zFaRMW3BaP-Tt8E= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 512 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 - LDSTrInst: false - LSCA: 512 - LSCB: 512 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1152 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM0_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x64x128_MI16xYeWN9tZpQCW9DZrfqqTK_y5abbBuQacoY_6asOJCEx0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA5_NTB2_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 110592 - LdsInitCVgprs: false - LdsNumBytes: 110592 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 17408 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] - MIWaveTileA: 3 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 6 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1153 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA5_NTB2_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 2 - ThreadTileA: 12 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x256x64_MI16yBdWkeYWY_eptRtD922RgiDQ8Dq25ci2fyY-P4ql8kg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC7_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1154 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC7_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x256x64_MI16Ms0dd4RHIG_WMoYIl5-1AwK1cEtHh_9-2asMn2GegQA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1155 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x256x64_MI16RmL8CMKs3abmUopF8c8Pj84PklcDPZ3YJimZqAQaTpo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA2_NTB3_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 16] - MIWaveTileA: 4 - MIWaveTileB: 16 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 4 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1156 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA2_NTB3_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 16 - ThreadTileA: 16 - ThreadTileB: 16 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x160x64_MI16rOVTex7lBbkCoCRJ6Kol_Pt1HKz3iQ9QWoZyuKqlAAY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB3_NTC1_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1157 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB3_NTC1_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT320x192x64_MI16JhWz8HR4Ozx_QFsCymLdPJkp0u7NGVqR0BvhUPY86Ds= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 147456 - LdsInitCVgprs: false - LdsNumBytes: 147456 - LdsNumElementsAlignedA: 46080 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 73728 - LdsOffsetB: 46080 - LdsOffsetB_Blk: 119808 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 46080 - LdsOffsetMetadata_Blk: 119808 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [10, 6] - MIWaveTileA: 10 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 320 - MacroTile1: 192 - MacroTileA: 320 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 120 - NumLoadsA: 40 - NumLoadsB: 24 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 40 - NumLoadsPerpendicularB: 24 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1158 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 6 - ThreadTileA: 40 - ThreadTileB: 6 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x320x64_MI16pWnMbqXR46MGZmtqGnuyZWyBgunOkRzNPXiN9mb_Qp0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 73728 - LdsInitCVgprs: false - LdsNumBytes: 73728 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 46080 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 158720 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 73728 - LdsOffsetMetadata_Blk: 158720 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 10] - MIWaveTileA: 6 - MIWaveTileB: 10 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 320 - MacroTileA: 192 - MacroTileB: 320 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 120 - NumLoadsA: 6 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1159 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 10 - ThreadTileA: 24 - ThreadTileB: 10 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x256x64_MI16iUeGTsZpmRqKoDjPGylLXyFkLFuIbSmKPrCHfEfcz0I= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB0_NTC7_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1160 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB0_NTC7_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16tTKs9Bobo4896Rn6pyEwLSJG7bw7MA6urE3_e30cug4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1161 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x144x128_MI16kwElsBWslWZFA4inQsiQvPTy8axcvY0uW8140a_FMGI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x144x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_9_MO40_NTn1_NTA0_NTB6_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 125440 - LdsInitCVgprs: false - LdsNumBytes: 125440 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 41472 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 83968 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18432 - LdsOffsetMetadata_Blk: 83968 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 9] - MIWaveTileA: 1 - MIWaveTileB: 9 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 144 - MacroTileA: 64 - MacroTileB: 144 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 36 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 4 - NumLoadsB: 9 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 9 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1162 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x144x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_9_MO40_NTn1_NTA0_NTB6_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 9 - ThreadTileA: 4 - ThreadTileB: 9 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT384x160x64_MI16AoZiuikCjAtI7o0V9fGYRI8_9mMMWsMRFH275KpZEi4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA0_NTB2_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 146688 - LdsInitCVgprs: false - LdsNumBytes: 146688 - LdsNumElementsAlignedA: 52224 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 73344 - LdsOffsetB: 52224 - LdsOffsetB_Blk: 125568 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 52224 - LdsOffsetMetadata_Blk: 125568 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [12, 5] - MIWaveTileA: 12 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 384 - MacroTile1: 160 - MacroTileA: 384 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 48 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 48 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1163 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA0_NTB2_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 5 - ThreadTileA: 48 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x256x64_MI16Y8TxBBS9bifMM7IxNsFmenMisrZ33tZ5QbFR54_-kMA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1164 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x256x64_MI16LCG604UGZOHZzw2NaKdk51GP6MMxsvBUNZDLM5_GJ40= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB0_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1165 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB0_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x256_MI16xS155ys4mzM6BQj6FfprjynrI5lFJPW0s0gNUZTurBzk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB7_NTC3_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67584 - LdsInitCVgprs: false - LdsNumBytes: 67584 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67584 - LdsOffsetMetadata_Blk: 164864 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 7 - NonTemporalC: 3 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1166 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB7_NTC3_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x256x64_MI16bfHqywFO81hFhO4oluv3mQgUYD6C3wMstY5HzTDY_B4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC0_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1167 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC0_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16_sSxkSgDYcBmIClOr8spTw722SRcXTJqaExxhWVHkls= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA3_NTB1_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1168 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA3_NTB1_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x256x64_MI16ZVEcpRilhnVvMOGPEyf_fUl1b1f9WHSxv86xpI7S7Rs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 4 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1169 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16-tYhbCxlfbhtzTOPHJhzchd4BVNV8Dc6A6QDANLveMI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1170 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT384x160x64_MI16uoloKMx6I4MRgCUee7moPz0hDCd9mcaOGKOEaQdxSoM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA3_NTB3_NTC7_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 146688 - LdsInitCVgprs: false - LdsNumBytes: 146688 - LdsNumElementsAlignedA: 52224 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 73344 - LdsOffsetB: 52224 - LdsOffsetB_Blk: 125568 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 52224 - LdsOffsetMetadata_Blk: 125568 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [12, 5] - MIWaveTileA: 12 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 384 - MacroTile1: 160 - MacroTileA: 384 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 48 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 48 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1171 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA3_NTB3_NTC7_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 5 - ThreadTileA: 48 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT320x128x64_MI32Wtc1yp6Ta3Pua26TOMtOY3BI-_7idMgKAt4367XhbLk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x128x64_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB2_NTC5_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 129024 - LdsInitCVgprs: false - LdsNumBytes: 129024 - LdsNumElementsAlignedA: 46080 - LdsNumElementsAlignedB: 17408 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 46080 - LdsOffsetB_Blk: 111616 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 46080 - LdsOffsetMetadata_Blk: 111616 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [5, 2] - MIWaveTileA: 5 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 320 - MacroTile1: 128 - MacroTileA: 320 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 160 - NumLoadsA: 10 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1172 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x128x64_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB2_NTC5_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 80 - ThreadTile1: 2 - ThreadTileA: 80 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x256_MI16x-pudhQ7fGawuUXdg64U7yO42tTWWyNIUjdps9hDrwEY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA5_NTB3_NTC5_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 136192 - LdsInitCVgprs: false - LdsNumBytes: 136192 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 68096 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1173 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA5_NTB3_NTC5_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x16x256_MI16xtt07358_0YVbgycjXjUwRmkDDH_2hGoOdccC0QMrTok= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58112 - LdsInitCVgprs: false - LdsNumBytes: 58112 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1174 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM0_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 7 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x16x256_MI16x8wxC-6tg13jz0ohIoI0V3uu5KuQ5guTMvbtNKjKx7n4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB1_NTC2_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58112 - LdsInitCVgprs: false - LdsNumBytes: 58112 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 1 - NonTemporalC: 2 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1175 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB1_NTC2_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x256_MI16xjLZGvMcn-2XSfwKS5fpF6auy3gUA2UC9lLk6n6pVAGk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA7_NTB2_NTC2_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 2 - NonTemporalC: 2 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1176 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA7_NTB2_NTC2_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 7 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x16x256_MI1640zrVku2d-o_pKaiM9ZuvyIrrHuvMlJlRNXC5wSoEGM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC2_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 2 - LVCA: 32 - LVCB: 128 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 152576 - LdsInitCVgprs: false - LdsNumBytes: 152576 - LdsNumElementsAlignedA: 67584 - LdsNumElementsAlignedB: 8704 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 76288 - LdsOffsetB: 67584 - LdsOffsetB_Blk: 143872 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67584 - LdsOffsetMetadata_Blk: 143872 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 1 - NonTemporalC: 2 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1177 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC2_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC16_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT512x16x64_MI16xiPimPr3JiV3nbFC6U_GRfFiP7RJIaLZ98adTdjtmZGI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT512x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_1_MO40_NTn1_NTA4_NTB2_NTC1_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 140288 - LdsInitCVgprs: false - LdsNumBytes: 140288 - LdsNumElementsAlignedA: 67584 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 70144 - LdsOffsetB: 67584 - LdsOffsetB_Blk: 137728 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67584 - LdsOffsetMetadata_Blk: 137728 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [8, 1] - MIWaveTileA: 8 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 512 - MacroTile1: 16 - MacroTileA: 512 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1178 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT512x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_1_MO40_NTn1_NTA4_NTB2_NTC1_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x64x64_MI16xzFmIialZJsNxItzr-zDgqFXT8i58bv2y0xoQaYdYieE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_2_MO40_NTn1_NTA7_NTB3_NTC0_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 107776 - LdsInitCVgprs: false - LdsNumBytes: 107776 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 2] - MIWaveTileA: 8 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 64 - MacroTileA: 256 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1179 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_2_MO40_NTn1_NTA7_NTB3_NTC0_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16ofycSbuK0LYxPwEekhAMKIySnqQvNtHf2PVhYPHxNvE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA7_NTB3_NTC0_NTD7_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1180 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA7_NTB3_NTC0_NTD7_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x32x64_MI16xGnajFGSc5v9ghEzUrtjydyVsFac202K1eboUzLYoOSE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA6_NTB2_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 103552 - LdsInitCVgprs: false - LdsNumBytes: 103552 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [8, 2] - MIWaveTileA: 8 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 32 - MacroTileA: 256 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 2 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1181 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA6_NTB2_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM2_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT512x128x64_MI16yaHHigmnL0zCRVkhyrhCzJpRof5vB7YpVpjVCZGmwvM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT512x128x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA6_NTB3_NTC1_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 84480 - LdsInitCVgprs: false - LdsNumBytes: 84480 - LdsNumElementsAlignedA: 67584 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 67584 - LdsOffsetB_Blk: 198656 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 84480 - LdsOffsetMetadata_Blk: 198656 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 512 - MacroTile1: 128 - MacroTileA: 512 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1182 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT512x128x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA6_NTB3_NTC1_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 7 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16Ur1U9ffXVZ7Ka1EXunwGDeUcDnI34yHDXxxeoAdl07o= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA5_NTB5_NTC1_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 5 - NonTemporalC: 1 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1183 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA5_NTB5_NTC1_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT512x128x64_MI16V3PL-B3Ing9LzaWSXrtfBGIK2o1DGNuMPKZg-VT7jX0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT512x128x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA4_NTB4_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 84480 - LdsInitCVgprs: false - LdsNumBytes: 84480 - LdsNumElementsAlignedA: 67584 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 67584 - LdsOffsetB_Blk: 198656 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 84480 - LdsOffsetMetadata_Blk: 198656 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 512 - MacroTile1: 128 - MacroTileA: 512 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1184 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT512x128x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA4_NTB4_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 7 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16K7EkM19lOgktmwEAw6FVkh7gDmk2zkq4B3cY0qt7EXE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA6_NTB2_NTC2_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 2 - NonTemporalC: 2 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1185 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA6_NTB2_NTC2_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16AHyARlXgTjdAI23gNW4dApx6DMJaSfJpM-0cMOXqgLM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA5_NTB2_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 2 - NonTemporalC: 3 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1186 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA5_NTB2_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 7 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16tx-4ufoPB35PKPzGeDu5RrFF5vOp90Mym77Kr7yJUzU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA4_NTB3_NTC2_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 3 - NonTemporalC: 2 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1187 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA4_NTB3_NTC2_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 7 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI3TDJTmiFLUEgX0tXG202WiRF6oFxe_0TT4eom7r9_0Ac= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA5_NTB2_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1188 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA5_NTB2_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI16R-ATXz9Yd6RvRd5kjy1WGtuKGx-vB0uyhTw_Cz4fROc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_6_MO40_NTn1_NTA6_NTB3_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126464 - LdsInitCVgprs: false - LdsNumBytes: 126464 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 26112 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 6] - MIWaveTileA: 2 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1189 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_6_MO40_NTn1_NTA6_NTB3_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x128x128_MI16OrDSdF2N5bMx7BZEKVkeNcXMFOQVnuE1GYd4kFi4p4k= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_2_MO40_NTn1_NTA0_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126464 - LdsInitCVgprs: false - LdsNumBytes: 126464 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 2] - MIWaveTileA: 6 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1190 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_2_MO40_NTn1_NTA0_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 2 - ThreadTileA: 24 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16vvzUUIRVm-_WNDYPm_mY7aT3cTQOuxE8xSK3D49GB7M= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA3_NTB2_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1191 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA3_NTB2_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16ZMmuRq8okPyiGVg9Uc5STuyJssxmYDfpDGCFMgo_u6I= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA0_NTB1_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 3] - MIWaveTileA: 16 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1192 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA0_NTB1_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM2_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 3 - ThreadTileA: 64 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16dgJxlSqa3PH3ZHKHY2WFZY8G9Uvukp0pn9K9jmhDu_4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_12_MO40_NTn1_NTA2_NTB3_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 12] - MIWaveTileA: 4 - MIWaveTileB: 12 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1193 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_12_MO40_NTn1_NTA2_NTB3_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 12 - ThreadTileA: 16 - ThreadTileB: 12 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT224x192x64_MI160KvHYc14waQOBkL2zi9jZwF3icdFceVrVPedMshh6n4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x192x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_6_MO40_NTn1_NTA3_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 - LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 35840 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 35840 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [7, 6] - MIWaveTileA: 7 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 224 - MacroTile1: 192 - MacroTileA: 224 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 168 - NumGlobalWriteVectorsPerThread: 168 - NumLoadsA: 7 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 7 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1194 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x192x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_6_MO40_NTn1_NTA3_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 6 - ThreadTileA: 28 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT320x192x64_MI165sx0jrMP2bDu9fyIgBFfOxD4DnHxq_GElkCuRlKNwGM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA2_NTB0_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 73728 - LdsInitCVgprs: false - LdsNumBytes: 73728 - LdsNumElementsAlignedA: 46080 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 46080 - LdsOffsetB_Blk: 177152 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 73728 - LdsOffsetMetadata_Blk: 177152 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [10, 6] - MIWaveTileA: 10 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 320 - MacroTile1: 192 - MacroTileA: 320 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 120 - NumLoadsA: 10 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1195 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA2_NTB0_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 6 - ThreadTileA: 40 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16kgKz9eDdoAIPzBKjLODCrBuTAfnKwHJZANsd6NDwx_A= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC7_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 7 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1196 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC7_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT320x192x64_MI16X48J9Ltjr6Fni-YKxBouUUC2mYFqLM8XpLOPd-sDHe8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA3_NTB0_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 147456 - LdsInitCVgprs: false - LdsNumBytes: 147456 - LdsNumElementsAlignedA: 46080 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 73728 - LdsOffsetB: 46080 - LdsOffsetB_Blk: 119808 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 46080 - LdsOffsetMetadata_Blk: 119808 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [10, 6] - MIWaveTileA: 10 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 320 - MacroTile1: 192 - MacroTileA: 320 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 120 - NumLoadsA: 40 - NumLoadsB: 24 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 40 - NumLoadsPerpendicularB: 24 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1197 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA3_NTB0_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 6 - ThreadTileA: 40 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT320x192x64_MI16SYrS5uSE4BZwd5zCtirVCnEaiomc9Gwfpg1irfDnwjI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA2_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 147456 - LdsInitCVgprs: false - LdsNumBytes: 147456 - LdsNumElementsAlignedA: 46080 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 73728 - LdsOffsetB: 46080 - LdsOffsetB_Blk: 119808 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 46080 - LdsOffsetMetadata_Blk: 119808 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [10, 6] - MIWaveTileA: 10 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 320 - MacroTile1: 192 - MacroTileA: 320 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 120 - NumLoadsA: 40 - NumLoadsB: 24 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 40 - NumLoadsPerpendicularB: 24 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1198 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA2_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 6 - ThreadTileA: 40 - ThreadTileB: 6 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI168TPQoax6F4UkmsaaU3KdrUIni5DMVXBrW9IC0RhNmv0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1199 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16CTefB-e7cCg4ntBxNlMXk5tK97DQ4ISlEZQ0v1ZQdP8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA6_NTB0_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 107776 - LdsInitCVgprs: false - LdsNumBytes: 107776 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 3] - MIWaveTileA: 8 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 192 - MacroTileA: 128 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1200 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA6_NTB0_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM32_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 3 - ThreadTileA: 32 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16TSGusewBQQP1TvscvqWbAy6gVUSgIEKyX_h0ovl0PD4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1201 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT320x192x64_MI162KXdL087ltM0nHmSSdKqcCJ_I5ZgMGqklXJFgkMJV58= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA2_NTB3_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 147456 - LdsInitCVgprs: false - LdsNumBytes: 147456 - LdsNumElementsAlignedA: 46080 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 73728 - LdsOffsetB: 46080 - LdsOffsetB_Blk: 119808 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 46080 - LdsOffsetMetadata_Blk: 119808 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [10, 6] - MIWaveTileA: 10 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 320 - MacroTile1: 192 - MacroTileA: 320 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 120 - NumLoadsA: 40 - NumLoadsB: 24 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 40 - NumLoadsPerpendicularB: 24 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1202 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA2_NTB3_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 6 - ThreadTileA: 40 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x128_MI16xaZOyY1JvgeZrlYRLln7CdOx3Ey8o_7PdzCBRYeyIqI0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB5_NTC7_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 35328 - LdsInitCVgprs: false - LdsNumBytes: 35328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35328 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 7 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1203 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB5_NTC7_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x80x128_MI16nWElDIMquA3ot6AV3fbrgjigAA3FZGb6UR_F8ui9JDg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB4_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 - LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 57856 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 80 - MacroTileA: 128 - MacroTileB: 80 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 4 - NonTemporalC: 1 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1204 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB4_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 5 - ThreadTileA: 8 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16xRp_h3XNfKJgKuXa390hR_a_PA_ozB8s58WbDDA8vco= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1205 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT384x160x64_MI16EU1PEmRnXOvL0QHfLfUoevgFsfi_zte_dZX-qyVqCTs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 146688 - LdsInitCVgprs: false - LdsNumBytes: 146688 - LdsNumElementsAlignedA: 52224 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 73344 - LdsOffsetB: 52224 - LdsOffsetB_Blk: 125568 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 52224 - LdsOffsetMetadata_Blk: 125568 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [12, 5] - MIWaveTileA: 12 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 384 - MacroTile1: 160 - MacroTileA: 384 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 48 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 48 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1206 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 5 - ThreadTileA: 48 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16AMWL3EVBr59_lpeIRjHZ5OTH_ogtoF3ZIEa5R4OAZ1A= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB1_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1207 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB1_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16wXkV5TwwY5Cn1b-JpZBDfovmW55D-0rmv6LNF7tIHLk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1208 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16ykvPBnHYAGU2gzFcLmZFD7tn0-B_zHzocTEkJxoRSzM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1209 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16I-I_p9PW9YTq4muXWI4l4cM_6YWQsOFlAeiPYcYnBbg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA1_NTB7_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 2] - MIWaveTileA: 8 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 0 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1210 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA1_NTB7_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x512x64_MI32hb5tzjXwCzldRFZdXoQyPxBPkv6tY4c23Yw4cMCM6uc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x512x64_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB0_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 84480 - LdsInitCVgprs: false - LdsNumBytes: 84480 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 67584 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 147968 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 84480 - LdsOffsetMetadata_Blk: 147968 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 512 - MacroTileA: 128 - MacroTileB: 512 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1211 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x512x64_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB0_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x96x64_MI16x8ifhaVedsGvXtVvssmJeGqcuP6lzVwXQQU3EBLvx1ik= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x96x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA2_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 - LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 15360 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 3] - MIWaveTileA: 6 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 96 - MacroTileA: 192 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 72 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 6 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1212 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x96x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA2_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 3 - ThreadTileA: 24 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x512x64_MI32fDHzmK6xXv9KWD_zOxf86BgGGKPdBqZvbIC9H9_dJPo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x512x64_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 84480 - LdsInitCVgprs: false - LdsNumBytes: 84480 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 67584 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 147968 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 84480 - LdsOffsetMetadata_Blk: 147968 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 512 - MacroTileA: 128 - MacroTileB: 512 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1213 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x512x64_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16xofYKRGZZFMipcebwqyHJvXrkAsHMwyx3fDtvPyIP_3o= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD6_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 - LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1214 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD6_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI1rzg0c0k-G552AVJtHvjXC_hzf6O8g9UuJzfRdWbtvJk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA2_NTB5_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 134144 - LdsInitCVgprs: false - LdsNumBytes: 134144 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67072 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 2] - MIWaveTileA: 8 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 5 - NonTemporalC: 6 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1215 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA2_NTB5_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI11rtTFRPNt4P5Va9APcSMLjHPb84VZ_C5_J1XxdlxNLg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB6_NTC0_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 0 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1216 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB6_NTC0_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x112x64_MI16oOsVY6D-NipSJxy10I5Rw5jRX6kT2cwwtrRzLOz54fk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x112x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB3_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 52736 - LdsInitCVgprs: false - LdsNumBytes: 52736 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 17920 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 52736 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 7] - MIWaveTileA: 4 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 112 - MacroTileA: 256 - MacroTileB: 112 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 112 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1217 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x112x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB3_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 7 - ThreadTileA: 16 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16x5nGlymsVtuy0UhY3byt5q7qpRM_MuGfMJE_XfThop3A= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 50176 - LdsInitCVgprs: false - LdsNumBytes: 50176 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 8704 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 4 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1218 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM0_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xLRdvQHXKSCos5Tu-D11kfmXmkI9jSpB1lGsPvG2GoAI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC1_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 - LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 4 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1219 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC1_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x48x256_MI16xWMH4TKhQRr2HJbKo9Afjor4Fu0shSgIudw1EsHpYQt8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA1_NTB1_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126464 - LdsInitCVgprs: false - LdsNumBytes: 126464 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 26112 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 48 - MacroTileA: 64 - MacroTileB: 48 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1220 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA1_NTB1_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 3 - ThreadTileA: 4 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x48x256_MI16xIiqpyrq_uqEeJj6BiF3ZmZrgcsqh5b0y013DTgUVW9c= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA2_NTB0_NTC1_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126464 - LdsInitCVgprs: false - LdsNumBytes: 126464 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 26112 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 48 - MacroTileA: 64 - MacroTileB: 48 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1221 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA2_NTB0_NTC1_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 3 - ThreadTileA: 4 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x96x128_MI16xP1L5z9WZqjHH7WZDMoE5_HmjExSEsFkiXWZJNTy9ac= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA1_NTB5_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 162816 - LdsInitCVgprs: false - LdsNumBytes: 162816 - LdsNumElementsAlignedA: 55296 - LdsNumElementsAlignedB: 26112 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 81408 - LdsOffsetB: 55296 - LdsOffsetB_Blk: 136704 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 55296 - LdsOffsetMetadata_Blk: 136704 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [3, 6] - MIWaveTileA: 3 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 96 - MacroTileA: 192 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 72 - NumGlobalWriteVectorsPerThread: 72 - NumLoadsA: 12 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 12 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1222 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA1_NTB5_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 6 - ThreadTileA: 12 - ThreadTileB: 6 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xyV07fwu9mTddfa7GWyFIPasGkXOBxRkCVDopz1XMCLY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 - LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 4 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1223 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xqWBuRuZWKZqWHHvNoJYjVGO_ekpT4MJGqXZdUhRH7rk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB0_NTC2_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 - LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 4 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 0 - NonTemporalC: 2 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1224 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB0_NTC2_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM0_WGMXCC16_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16CH1VcmCQmierqW5AI3bjXgohKh5G7hJmekg943ulY18= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA3_NTB2_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 4] - MIWaveTileA: 16 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1225 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA3_NTB2_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x96x128_MI16x5j7S8DsiiOwJFsOXYUlJ1sagQRIBIIxRZnRkoen9XFY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA2_NTB5_NTC1_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 102400 - LdsInitCVgprs: false - LdsNumBytes: 102400 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 74752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 74752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 96 - MacroTileA: 32 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 5 - NonTemporalC: 1 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1226 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA2_NTB5_NTC1_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 3 - ThreadTileA: 4 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16Vji1j4OxfsLqG_xzGoVP-y3iYAloZaNl0PB8lNHJwcc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB0_NTC7_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1227 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB0_NTC7_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x64_MI16x1nMK6AfLdIroY7RvkxdBHqku3JZRVsx_AwMoWOzZPuEE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA1_NTB1_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 54656 - LdsInitCVgprs: false - LdsNumBytes: 54656 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 12672 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1228 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA1_NTB1_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x128_MI16xmnyarJco_A8IfBxdNGDCY0IdmOXcznvmDyuAKXCarnw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB1_NTC0_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59904 - LdsInitCVgprs: false - LdsNumBytes: 59904 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1229 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB1_NTC0_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x160x64_MI16xJ4zCv-v4yPVlnVRuvGsRDc3yy_OXzgPhOxGqze9I82o= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA2_NTB3_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 100352 - LdsInitCVgprs: false - LdsNumBytes: 100352 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 25600 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 74752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 74752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 160 - MacroTileA: 64 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 2 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1230 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA2_NTB3_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 5 - ThreadTileA: 8 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x320x64_MI163BB2OdtcDRq0YTxjzncmz9jNVN2ieeiaJV92h-h8bkk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA2_NTB3_NTC7_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 77312 - LdsInitCVgprs: false - LdsNumBytes: 77312 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 51200 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 157184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 77312 - LdsOffsetMetadata_Blk: 157184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 5] - MIWaveTileA: 12 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 320 - MacroTileA: 192 - MacroTileB: 320 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 6 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1231 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA2_NTB3_NTC7_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 5 - ThreadTileA: 48 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI32pa90HkBFZszgmyk8cSWj7VJh6x6NoAmXyTa1-wCPPAw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB2_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126464 - LdsInitCVgprs: false - LdsNumBytes: 126464 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 26112 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1232 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB2_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM48_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI16VAoYLWEAzwFDxDd54BeMVgztslaKoupk2qXOkZfcy6M= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 - LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1233 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI16m67iHGRkyjsua8Y3ZAKWIQ3lrNKmV0GybKAIHGx651s= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_6_MO40_NTn1_NTA1_NTB3_NTC0_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126464 - LdsInitCVgprs: false - LdsNumBytes: 126464 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 26112 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 6] - MIWaveTileA: 2 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1234 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_6_MO40_NTn1_NTA1_NTB3_NTC0_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16Sc9ZnmNmQyJZzXk5_bTgKnXoMLctw_GCr4c6imMZKAU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1235 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16XNYwhkB1i3h3py0aWfqcrtZp3o_uju369IqAuDh0D9Y= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA0_NTB1_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [16, 4] - MIWaveTileA: 16 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 1 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1236 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA0_NTB1_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16x4DHRGJR75pOZw4c5p5rnEZpn666-eMOFPEiWzJf16cs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB1_NTC4_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 62336 - LdsInitCVgprs: false - LdsNumBytes: 62336 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 12672 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1237 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB1_NTC4_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 7 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16i8t-GgU5Q80x4TX7poc8O8Z6fliYd9KW4Fk6McpaUsg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA2_NTB5_NTC2_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 2] - MIWaveTileA: 8 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 5 - NonTemporalC: 2 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1238 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA2_NTB5_NTC2_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16HeJ3HkPhXArbTaCzCEhModAMu1qfY2zEsIiaUPHpOjo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA1_NTB1_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 103552 - LdsInitCVgprs: false - LdsNumBytes: 103552 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 4 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1239 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA1_NTB1_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16zxF5V4DipeTyDKU2HauhCwa4opcsH-3KSbbqpAojhUY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB2_NTC5_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 107776 - LdsInitCVgprs: false - LdsNumBytes: 107776 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 25344 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 3] - MIWaveTileA: 8 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 192 - MacroTileA: 128 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 16 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1240 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB2_NTC5_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 3 - ThreadTileA: 32 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16T8bIoEQ8w83-veNnIiq2cRL-zF5zcd9j_KxG5-gph6w= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1241 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16y41iHlpqnwrl4puQwXWETtJTEdTi-uR6doRiTEcPSOI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 7 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1242 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI163S619iUTTs1DRCJUrp43Nutt3LrBnhsWmzF-r3we8_g= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1243 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16SikajvzVA5OXLAR1plgs1OpeZpGvWkBbL6pEQQzseWY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA3_NTB0_NTC6_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1244 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA3_NTB0_NTC6_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16bR3OkMogEfrqwiHIPFS1G8qSzbA3OzRizISozfX8yn8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 4 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1245 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16Ikz9NrEog9Iab0ACUAMKFprssfdTaWu0Sn-WfcoA4yQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA3_NTB2_NTC1_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120448 - LdsInitCVgprs: false - LdsNumBytes: 120448 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1246 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA3_NTB2_NTC1_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x16x128_MI16xq2mNsqje0dAsoGXSlhSzpzKYmt1D4fA2PGazwbV3lwY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA2_NTB0_NTC3_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 13312 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 4608 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 25088 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 13312 - LdsOffsetMetadata_Blk: 25088 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 4 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1247 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA2_NTB0_NTC3_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x512x32_MI16U44d2XEmaaLr7fxfEIJvdiGdIfQ7-6dJT0GInqIzp2E= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x512x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 107776 - LdsInitCVgprs: false - LdsNumBytes: 107776 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 73984 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 73984 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 512 - MacroTileA: 128 - MacroTileB: 512 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1248 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x512x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x16x256_MI16xxe0lbF1wYDgdwMiICCtw-8ZmcwHCc0IyKMRBC_Z4OuA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58112 - LdsInitCVgprs: false - LdsNumBytes: 58112 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1249 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x224x64_MI16xdvP8knMbPwcl7ZvrEk-6U7Slr8ePNn9-ax9TSWYIvFQ= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 40960 - LdsInitCVgprs: false - LdsNumBytes: 40960 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 35840 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 70656 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 40960 - LdsOffsetMetadata_Blk: 70656 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 7] - MIWaveTileA: 1 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 224 - MacroTileA: 32 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 28 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 1 - NumLoadsB: 14 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 14 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1250 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC4_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 7 - ThreadTileA: 4 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x320x64_MI16xQcS-IB5os2-njnft6oKfGtmZilIzSH7Yf8qnoMMZoUw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x320x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 55808 - LdsInitCVgprs: false - LdsNumBytes: 55808 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 51200 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 70144 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 55808 - LdsOffsetMetadata_Blk: 70144 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 320 - MacroTileA: 32 - MacroTileB: 320 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 1 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1251 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x320x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC16_WGMXCCGn1 - SourceSwap: 0 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 5 - ThreadTileA: 8 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x384x64_MI32xVmLymHmALnw8uJzr2rIVhG01yzw4b-CihZZAolB_J94= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x384x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 - LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 52224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 74752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 74752 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 6] - MIWaveTileA: 1 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 384 - MacroTileA: 64 - MacroTileB: 384 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 2 - NumLoadsB: 12 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 12 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1252 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x384x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 6 - ThreadTileA: 16 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x128_MI16xljLkR-MC2Kp7r4mQWnSgh-_rAHdYEmvYJ0vhf_HZrbE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 - LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 17408 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1253 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x16x128_MI16xY_fB8yBjfBVTqf21JvYZMtCM0wbalgcOF0cbbjUrcj8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 55808 - LdsInitCVgprs: false - LdsNumBytes: 55808 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 4608 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18432 - LdsOffsetMetadata_Blk: 51200 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1254 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x256_MI16xK_IdtcbXjwpCsQHhFBzb3dEZoarGsVFXe0p_YKxZSjo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1255 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x256x32_MI32hF-nawwsg1CZEExwm7AU15iYRlpBXTHaGjx3S2MwWFw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58880 - LdsInitCVgprs: false - LdsNumBytes: 58880 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 8] - MIWaveTileA: 1 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 128 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1256 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM48_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x64_MI16x1RJsBXy1EedBpQvnlTww6ISCXq0R26MRxhWKF54ouZ_Q= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29440 - LdsInitCVgprs: false - LdsNumBytes: 29440 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 4608 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1257 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x64x128_MI16xYDO1iJj3E-JDD2XpyQJtuSnyKAvVRi3BkhUYaZqQeuY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 55808 - LdsInitCVgprs: false - LdsNumBytes: 55808 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 37376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 37376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1258 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x128_MI16xKcoHi0rxFknrT_7yBPgOnz2MfWxKC3xFrC0Ni7xVUzI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59904 - LdsInitCVgprs: false - LdsNumBytes: 59904 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1259 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x128_MI16xlFDa2K3MuMsbiAffy908Ci3OR10WpBziR3hLTuSX7z0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 - LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1260 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x128_MI16xIKXHmd8gZNhv-zbcLN0faFPYz43Jrl2CV2wrzpFB0a8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59904 - LdsInitCVgprs: false - LdsNumBytes: 59904 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1261 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x128_MI16xjTnWDBj9gfbgCPi_eVN5gKALAsymxsBI1IBw6MBcv4Q= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59904 - LdsInitCVgprs: false - LdsNumBytes: 59904 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1262 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x48x128_MI16xh6jJh0ws8zzps7fYbjhs6hy_-3ZUYAAUjeK9k60roG8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32256 - LdsInitCVgprs: false - LdsNumBytes: 32256 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 13824 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32256 - LdsOffsetMetadata_Blk: 51200 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 48 - MacroTileA: 64 - MacroTileB: 48 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1263 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 3 - ThreadTileA: 4 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI32Rffl6ymUFAp98jlfV4YXC6pdaxdjkw2_VHdOmM4jHqo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 136192 - LdsInitCVgprs: false - LdsNumBytes: 136192 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 68096 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 2] - MIWaveTileA: 8 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1264 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 128 - ThreadTile1: 2 - ThreadTileA: 128 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x320x64_MI32bsLqIqAcetg6HG7D18y2X07rgxcu8_ErA6h27oGBrIs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x320x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 159744 - LdsInitCVgprs: false - LdsNumBytes: 159744 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 46080 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 79872 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 113664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 113664 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 320 - MacroTileA: 256 - MacroTileB: 320 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 320 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 8 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1265 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x320x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 5 - ThreadTileA: 64 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT304x256x32_MI16Tjjrwdw9U46olzLPfOBMR5EhgMS12b4ye0cB8SpssnA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT304x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT19_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 108288 - LdsInitCVgprs: false - LdsNumBytes: 108288 - LdsNumElementsAlignedA: 24320 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 24320 - LdsOffsetB_Blk: 89856 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24320 - LdsOffsetMetadata_Blk: 89856 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [19, 4] - MIWaveTileA: 19 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 304 - MacroTile1: 256 - MacroTileA: 304 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 304 - NumGlobalWriteVectorsPerThread: 304 - NumLoadsA: 19 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 19 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1266 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT304x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT19_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 76 - ThreadTile1: 4 - ThreadTileA: 76 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT160x224x64_MI1648bVaAUzsdymA40ZBReGgUBY9wSXQKOIE6GYdWx-uok= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x224x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_7_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 21120 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 21120 - LdsOffsetB_Blk: 86656 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 21120 - LdsOffsetMetadata_Blk: 86656 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [5, 7] - MIWaveTileA: 5 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 224 - MacroTileA: 160 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 140 - NumGlobalWriteVectorsPerThread: 140 - NumLoadsA: 5 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1267 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x224x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_7_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 7 - ThreadTileA: 20 - ThreadTileB: 7 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x128_MI16xa6ICeA7RTUgxrRVggJel5U7xK5T7ToXE1EmSGkbS5_k= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 - LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1268 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT352x192x64_MI16l1vEnPkVuHET7NK2ATDb0pU-YiGihu1HUgRCpu7gOA4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT352x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT11_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 83968 - LdsInitCVgprs: false - LdsNumBytes: 83968 - LdsNumElementsAlignedA: 56320 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 56320 - LdsOffsetB_Blk: 187392 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 83968 - LdsOffsetMetadata_Blk: 187392 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [11, 6] - MIWaveTileA: 11 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 352 - MacroTile1: 192 - MacroTileA: 352 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 264 - NumGlobalWriteVectorsPerThread: 264 - NumLoadsA: 11 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 11 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1269 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT352x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT11_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 44 - ThreadTile1: 6 - ThreadTileA: 44 - ThreadTileB: 6 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT384x192x64_MI32Zpo8goNvpSlOIM0c0x1NSjHMZwETEAcX_HCkN0AJHWE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x192x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 159744 - LdsInitCVgprs: false - LdsNumBytes: 159744 - LdsNumElementsAlignedA: 52224 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 79872 - LdsOffsetB: 52224 - LdsOffsetB_Blk: 132096 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 52224 - LdsOffsetMetadata_Blk: 132096 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 3] - MIWaveTileA: 6 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 384 - MacroTile1: 192 - MacroTileA: 384 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 288 - NumGlobalWriteVectorsPerThread: 144 - NumLoadsA: 12 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 12 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1270 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x192x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 96 - ThreadTile1: 3 - ThreadTileA: 96 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT224x320x64_MI16IbznA20tkEc7Vj4IJ67XlE06oSnbtJs8HSzot_SXq6s= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x320x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 81920 - LdsInitCVgprs: false - LdsNumBytes: 81920 - LdsNumElementsAlignedA: 35840 - LdsNumElementsAlignedB: 46080 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 35840 - LdsOffsetB_Blk: 166912 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 81920 - LdsOffsetMetadata_Blk: 166912 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [7, 10] - MIWaveTileA: 7 - MIWaveTileB: 10 - MIWaveTileMetadata: 0 - MacroTile0: 224 - MacroTile1: 320 - MacroTileA: 224 - MacroTileB: 320 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 280 - NumGlobalWriteVectorsPerThread: 280 - NumLoadsA: 7 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 7 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1271 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x320x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 10 - ThreadTileA: 28 - ThreadTileB: 10 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x256_MI16xvu2kQO08CL2bF1aHWqWlZ9pMX10-g7H0PnZ6cGUDiEA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117248 - LdsInitCVgprs: false - LdsNumBytes: 117248 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 256 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1272 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x64_MI16x1F1fhxsSSmWc3n3OLho1FVytNcfOtjYTn4OR17BAGQ_g= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24576 - LdsInitCVgprs: false - LdsNumBytes: 24576 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 15360 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1273 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT224x352x64_MI16CsJoK9idtPSUD8Ty7Ztn7FpFCVuWcTWXdguIEd4CJv0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x352x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 157440 - LdsInitCVgprs: false - LdsNumBytes: 157440 - LdsNumElementsAlignedA: 32256 - LdsNumElementsAlignedB: 46464 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 78720 - LdsOffsetB: 32256 - LdsOffsetB_Blk: 110976 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32256 - LdsOffsetMetadata_Blk: 110976 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [7, 11] - MIWaveTileA: 7 - MIWaveTileB: 11 - MIWaveTileMetadata: 0 - MacroTile0: 224 - MacroTile1: 352 - MacroTileA: 224 - MacroTileB: 352 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 308 - NumGlobalWriteVectorsPerThread: 308 - NumLoadsA: 28 - NumLoadsB: 11 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 28 - NumLoadsPerpendicularB: 11 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1274 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x352x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 11 - ThreadTileA: 28 - ThreadTileB: 11 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT320x224x64_MI16N1M5rkedW9IIBVtKqKgnIaGtLErocWo9rCj4fQQxbBI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x224x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 151296 - LdsInitCVgprs: false - LdsNumBytes: 151296 - LdsNumElementsAlignedA: 46080 - LdsNumElementsAlignedB: 29568 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 75648 - LdsOffsetB: 46080 - LdsOffsetB_Blk: 121728 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 46080 - LdsOffsetMetadata_Blk: 121728 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [10, 7] - MIWaveTileA: 10 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 320 - MacroTile1: 224 - MacroTileA: 320 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 280 - NumGlobalWriteVectorsPerThread: 140 - NumLoadsA: 40 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 40 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1275 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x224x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 7 - ThreadTileA: 40 - ThreadTileB: 7 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16RHMEUqyJMPgtxlEMcaBCkYo-nIQbn_jLPePb0FrByaY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1276 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT320x224x64_MI16QWWiu2I6N3q4RKm0bYWYGdmLju8YmPdMM2od6IXGxUA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 81920 - LdsInitCVgprs: false - LdsNumBytes: 81920 - LdsNumElementsAlignedA: 46080 - LdsNumElementsAlignedB: 35840 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 46080 - LdsOffsetB_Blk: 177152 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 81920 - LdsOffsetMetadata_Blk: 177152 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [10, 7] - MIWaveTileA: 10 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 320 - MacroTile1: 224 - MacroTileA: 320 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 280 - NumGlobalWriteVectorsPerThread: 140 - NumLoadsA: 10 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1277 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 7 - ThreadTileA: 40 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16x0QZ3Xd_U4bAcjnf2AhgHkVcKZt4sFFcqMwG-cj7oUs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1278 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x128_MI16xUdp65sJqNoqnh6_4ZUgmPACJNOhuYax1SY1F1-WMB5U= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58112 - LdsInitCVgprs: false - LdsNumBytes: 58112 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumTotalPackedLoadsA: 2 - NumTotalPackedLoadsB: 4 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1279 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x128_MI16zpFtN9LdkBZoDzbM36b0TsUCZeT7PdmyQPjRpwZgPFA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 4 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1280 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x32_MI16510iu-d4Np1RpdbP6m1mk7l4bnEoYz1VU7F44UA8jng= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 54656 - LdsInitCVgprs: false - LdsNumBytes: 54656 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 12672 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 3] - MIWaveTileA: 8 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 192 - MacroTileA: 128 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 3 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1281 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 3 - ThreadTileA: 32 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16F3kReQ2eH9TE-qSkS5O0f5m_mKkDGWkDtpg_-LiGDPo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 35328 - LdsInitCVgprs: false - LdsNumBytes: 35328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35328 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 2] - MIWaveTileA: 8 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1282 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x352x64_MI162pLXduUvBQP0-lV9atvoPb18J3juYG1iEYmbEfJdyzs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x352x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_11_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 83968 - LdsInitCVgprs: false - LdsNumBytes: 83968 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 56320 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 158720 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 83968 - LdsOffsetMetadata_Blk: 158720 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 11] - MIWaveTileA: 6 - MIWaveTileB: 11 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 352 - MacroTileA: 192 - MacroTileB: 352 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 264 - NumGlobalWriteVectorsPerThread: 132 - NumLoadsA: 6 - NumLoadsB: 11 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 11 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1283 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x352x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_11_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 11 - ThreadTileA: 24 - ThreadTileB: 11 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x32_MI16Ars6JuwHiKDSsjI9NlZXKxYKv7f9EZfkB00w5X7ciWk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30720 - LdsInitCVgprs: false - LdsNumBytes: 30720 - LdsNumElementsAlignedA: 15360 - LdsNumElementsAlignedB: 15360 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 15360 - LdsOffsetB_Blk: 48128 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 30720 - LdsOffsetMetadata_Blk: 48128 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 6] - MIWaveTileA: 6 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 192 - MacroTileA: 192 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 72 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1284 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 6 - ThreadTileA: 24 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x288x64_MI16gYi6KcVjhFqquzIAsZ7MfVfRHg_pbA_zwydTMzBA1T8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x288x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 73728 - LdsInitCVgprs: false - LdsNumBytes: 73728 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 46080 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 158720 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 73728 - LdsOffsetMetadata_Blk: 158720 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 9] - MIWaveTileA: 6 - MIWaveTileB: 9 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 288 - MacroTileA: 192 - MacroTileB: 288 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 216 - NumGlobalWriteVectorsPerThread: 108 - NumLoadsA: 6 - NumLoadsB: 9 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 9 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1285 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x288x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 9 - ThreadTileA: 24 - ThreadTileB: 9 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x32_MI16qGg4gMNzi8gLYbcwfQtpkFvZdpiVCgTgsuPxf3sOe7E= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59264 - LdsInitCVgprs: false - LdsNumBytes: 59264 - LdsNumElementsAlignedA: 13824 - LdsNumElementsAlignedB: 12672 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 13824 - LdsOffsetB_Blk: 46592 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 13824 - LdsOffsetMetadata_Blk: 46592 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 3] - MIWaveTileA: 12 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 192 - MacroTileA: 192 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 12 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 12 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumTotalPackedLoadsA: 12 - NumTotalPackedLoadsB: 3 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1286 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 3 - ThreadTileA: 48 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x32_MI16Taiofo8LsVqbUFqVYq-ILVex9ddEbciO8EKO2rFKNt0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61952 - LdsInitCVgprs: false - LdsNumBytes: 61952 - LdsNumElementsAlignedA: 13824 - LdsNumElementsAlignedB: 15360 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 13824 - LdsOffsetB_Blk: 46592 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 13824 - LdsOffsetMetadata_Blk: 46592 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 3] - MIWaveTileA: 12 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 192 - MacroTileA: 192 - MacroTileB: 192 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1287 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 3 - ThreadTileA: 48 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16Z_kSdkcI4g5s7lc4MHuNzxTskYaE1_oj4Ot5voiNmXg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126464 - LdsInitCVgprs: false - LdsNumBytes: 126464 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 4] - MIWaveTileA: 12 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1288 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 4 - ThreadTileA: 48 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x320x64_MI16ZdvaJkAZRurjQXNvGSJkYtmmK3QsUQdjL6rt-AFyQLc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 25344 - LdsNumElementsAlignedB: 42240 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 25344 - LdsOffsetB_Blk: 92928 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25344 - LdsOffsetMetadata_Blk: 92928 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 5] - MIWaveTileA: 12 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 320 - MacroTileA: 192 - MacroTileB: 320 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 6 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumTotalPackedLoadsA: 6 - NumTotalPackedLoadsB: 10 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1289 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 5 - ThreadTileA: 48 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16auSttg0EKVCahHTsCVhh2cHh6K9v0owM5LZjw9upYO0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 - LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 25344 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 25344 - LdsOffsetB_Blk: 90880 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25344 - LdsOffsetMetadata_Blk: 90880 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 4] - MIWaveTileA: 12 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumTotalPackedLoadsA: 6 - NumTotalPackedLoadsB: 8 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1290 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 4 - ThreadTileA: 48 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI1-1hEVZXQwB7Hws6RaqtCFGEV7Acaqu3sY_RqiOOC7RE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 136192 - LdsInitCVgprs: false - LdsNumBytes: 136192 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 68096 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 2] - MIWaveTileA: 8 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1291 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI16FNZI1UQPjZLy0q7-OOgjKhZRyX00yKIq5CfnQfl_V3c= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26624 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26624 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1292 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI16Gjd69lk-6PFMJ4hivbEFNUHWuAsTA0DZOYw6wfKBqEk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: true - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 - LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1293 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 64 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 2 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI162KTaXr7l0KxzQ5okfYZt6ELI8DjoxXRRlCeFBY7TRq0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: true - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58112 - LdsInitCVgprs: false - LdsNumBytes: 58112 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumTotalPackedLoadsA: 4 - NumTotalPackedLoadsB: 2 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1294 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 2 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16kY8ELbPpI_HvuAM-0iLbQ3XhdpmE6DmGEqo9Lq22fls= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 8 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1295 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x224x64_MI16VB2tykcLAxxPQCIrT_2sMKERzY3mGwD7KIGoTP2KAu0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 53248 - LdsInitCVgprs: false - LdsNumBytes: 53248 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 35840 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 53248 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 7] - MIWaveTileA: 4 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 224 - MacroTileA: 128 - MacroTileB: 224 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 112 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 4 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 7 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1296 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 7 - ThreadTileA: 16 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI165u1frtcv6LOOFyTOroan3pGoqINhRgHC2_Td60yfhm0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 - LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumTotalPackedLoadsA: 4 - NumTotalPackedLoadsB: 4 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1297 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI1GS5kxWaKBHhotJrg4AiEHwBFGMMKDZHMHRwyi5luodo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 2048 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 136192 - LdsInitCVgprs: false - LdsNumBytes: 136192 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 33280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 68096 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 102912 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 102912 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 8] - MIWaveTileA: 2 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1298 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI3NvZvxpGo9E4DLXcNaAG_Q1fEQ_9-rCX24hEk7ZcKb8U= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1299 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT224x256x64_MI16AooNQ0DxCE1iMNXzqX0FHV8K0vAlx-m-ncqv9HuoLPs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 - LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 35840 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 35840 - LdsOffsetB_Blk: 105472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35840 - LdsOffsetMetadata_Blk: 105472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [7, 8] - MIWaveTileA: 7 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 224 - MacroTile1: 256 - MacroTileA: 224 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 224 - NumLoadsA: 7 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 7 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1300 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 8 - ThreadTileA: 28 - ThreadTileB: 8 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI3rh3QQEBSU5ZwlIzZgLp80EZwlPEoe5VChZIZ7RQ89CY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 136192 - LdsInitCVgprs: false - LdsNumBytes: 136192 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 68096 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1301 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 1 - ThreadTileA: 64 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x256x64_MI32Jze6abHJ8l5M0yYk0uV7vwVwvrcqi7qDRWfQz8wCrH8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 - LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1302 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI3207RbkMHStFD_YoILeumHizH3M8jeERDgPC5Q3QefifM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 - LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1303 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI32XwDoh2Lmq9ee-Smh9-VB0ZyuLvkIBEspEN9Q2R_QhGo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 - LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1304 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI165JlqJQiebX2lT4lUU-UJbJ4riuk32JddM1ZpBJ38BuA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 - LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 8 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1305 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 8 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16ioP1gzZP2YiHyhxAp-lrgdDadhVY-tEt9vfS_ve4KV4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 137216 - LdsInitCVgprs: false - LdsNumBytes: 137216 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 68608 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 103424 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 103424 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 16] - MIWaveTileA: 4 - MIWaveTileB: 16 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1306 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 16 - ThreadTileA: 16 - ThreadTileB: 16 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT512x128x32_MI32CpkerEf2mAVVrDatpBRwuRndjKbhR0RH7EPU5eAZMNc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT512x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 109056 - LdsInitCVgprs: false - LdsNumBytes: 109056 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 8704 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 512 - MacroTile1: 128 - MacroTileA: 512 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1307 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT512x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS64_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 64 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT23wJ8qNpK8Q_UogSvwK03hvcukDrQtjltSVIFXFfA4es= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: false - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x240x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM1024_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_15_MO40_NTn1_NTA0_NTB0_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 1024 - LdsBytesNoAmax: 136704 - LdsInitCVgprs: false - LdsNumBytes: 136704 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 34560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 68352 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 102144 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 102144 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: -1 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 15] - MIWaveTileA: 4 - MIWaveTileB: 15 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 240 - MacroTileA: 256 - MacroTileB: 240 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 8 - NumLoadsB: 30 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 30 - NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 30 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1308 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x240x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM1024_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_15_MO40_NTn1_NTA0_NTB0_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCCn1_WGMXCCGn1 - SourceSwap: true - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: true - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 15 - ThreadTileA: 16 - ThreadTileB: 15 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: true - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: -1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT2GQQfPVRirXfdyXyJUWH87knWjo4-PQrW8_xmR_z5WEU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: false - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x208x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM1024_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 1024 - LdsBytesNoAmax: 129280 - LdsInitCVgprs: false - LdsNumBytes: 129280 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 29952 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: -1 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 13] - MIWaveTileA: 4 - MIWaveTileB: 13 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 208 - MacroTileA: 256 - MacroTileB: 208 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 208 - NumGlobalWriteVectorsPerThread: 52 - NumLoadsA: 8 - NumLoadsB: 26 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 26 - NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 26 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1309 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x208x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM1024_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: true - SpaceFillingAlgo: [] - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 13 - ThreadTileA: 16 - ThreadTileB: 13 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: true - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT2GGMqhPpxk2J1Q1819y_V3YsuUalVXKAw0Crcj6ZVwr0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 128896 - LdsInitCVgprs: false - LdsNumBytes: 128896 - LdsNumElementsAlignedA: 29568 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 29568 - LdsOffsetB_Blk: 95104 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 29568 - LdsOffsetMetadata_Blk: 95104 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [7, 8] - MIWaveTileA: 7 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 224 - MacroTile1: 256 - MacroTileA: 224 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: true - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 224 - NumLoadsA: 7 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 7 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumTotalPackedLoadsA: 7 - NumTotalPackedLoadsB: 8 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1310 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 8 - ThreadTileA: 28 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1iMO6yjTG4yuZPDkcMXrcObwuUOXV_ipLalV5INSQqYE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123776 - LdsInitCVgprs: false - LdsNumBytes: 123776 - LdsNumElementsAlignedA: 24960 - LdsNumElementsAlignedB: 33280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 24960 - LdsOffsetB_Blk: 90496 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24960 - LdsOffsetMetadata_Blk: 90496 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumTotalPackedLoadsA: 6 - NumTotalPackedLoadsB: 8 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1311 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false -- [2, 3, 0, 1] -- - - [16, 368640, 1, 224] - - [0, 0.0] - - - [121, 491520, 1, 105] - - [1256, 0.0] - - - [256, 1280, 1, 8192] - - [950, 0.0] - - - [128, 3072, 1, 440] - - [1258, 0.0] - - - [160, 3072, 1, 316] - - [1, 0.0] - - - [384, 245760, 1, 648] - - [2, 0.0] - - - [208, 491520, 1, 1224] - - [3, 0.0] - - - [256, 3072, 1, 316] - - [1261, 0.0] - - - [256, 3072, 1, 440] - - [1262, 0.0] - - - [256, 3072, 1, 888] - - [1263, 0.0] - - - [2048, 1280, 1, 20512] - - [946, 0.0] - - - [1032, 245760, 1, 4096] - - [4, 0.0] - - - [304, 368640, 1, 624] - - [5, 0.0] - - - [512, 1792, 1, 2048] - - [6, 0.0] - - - [320, 3072, 1, 316] - - [7, 0.0] - - - [320, 3072, 1, 440] - - [8, 0.0] - - - [320, 3072, 1, 888] - - [1268, 0.0] - - - [352, 245760, 1, 3600] - - [9, 0.0] - - - [400, 3072, 1, 512] - - [10, 0.0] - - - [440, 3072, 1, 256] - - [11, 0.0] - - - [424, 286720, 1, 1872] - - [12, 0.0] - - - [512, 3072, 1, 72] - - [13, 0.0] - - - [512, 3072, 1, 240] - - [1273, 0.0] - - - [512, 3072, 1, 2048] - - [14, 0.0] - - - [768, 3072, 1, 2048] - - [15, 0.0] - - - [768, 3072, 1, 3840] - - [16, 0.0] - - - [768, 3072, 1, 4480] - - [953, 0.0] - - - [768, 3072, 1, 5120] - - [17, 0.0] - - - [888, 3072, 1, 256] - - [18, 0.0] - - - [1024, 1280, 1, 5632] - - [19, 0.0] - - - [1024, 1792, 1, 13184] - - [949, 0.0] - - - [1024, 1792, 1, 64704] - - [952, 0.0] - - - [2304, 1280, 1, 9600] - - [20, 0.0] - - - [1024, 3072, 1, 4800] - - [942, 0.0] - - - [1280, 1568, 1, 512] - - [21, 0.0] - - - [1536, 1792, 1, 3200] - - [22, 0.0] - - - [1536, 3072, 1, 4200] - - [945, 0.0] - - - [1536, 3072, 1, 19776] - - [23, 0.0] - - - [1880, 3072, 1, 512] - - [24, 0.0] - - - [1920, 3072, 1, 2048] - - [25, 0.0] - - - [2048, 1792, 1, 8224] - - [26, 0.0] - - - [2048, 3072, 1, 1880] - - [27, 0.0] - - - [3200, 1792, 1, 64704] - - [1275, 0.0] - - - [3264, 3072, 1, 256] - - [28, 0.0] - - - [3840, 1280, 1, 5692] - - [29, 0.0] - - - [9600, 1280, 1, 5632] - - [30, 0.0] - - - [4200, 3072, 1, 256] - - [31, 0.0] - - - [98304, 1280, 1, 9600] - - [1278, 0.0] - - - [264, 160, 1792, 32] - - [32, 0.0] - - - [192, 4090, 1280, 32] - - [33, 0.0] - - - [4090, 192, 1280, 32] - - [34, 0.0] - - - [257, 160, 1792, 32] - - [35, 0.0] - - - [192, 641, 1280, 32] - - [36, 0.0] - - - [641, 192, 1280, 32] - - [37, 0.0] - - - [2126, 192, 1280, 32] - - [38, 0.0] - - - [160, 286720, 1, 257] - - [39, 0.0] - - - [20480, 1792, 1, 3200] - - [1277, 0.0] - - - [256, 1792, 1, 256] - - [40, 0.0] - - - [256, 1280, 1, 256] - - [41, 0.0] - - - [384, 245760, 1, 641] - - [1270, 0.0] - - - [5632, 1280, 1, 1024] - - [1276, 0.0] - - - [120, 3072, 1, 96] - - [42, 0.0] - - - [120, 3072, 1, 120] - - [43, 0.0] - - - [120, 3072, 1, 160] - - [1254, 0.0] - - - [160, 1219, 3072, 40] - - [44, 0.0] - - - [1219, 160, 3072, 40] - - [45, 0.0] - - - [208, 491520, 1, 1219] - - [46, 0.0] - - - [128, 491520, 1, 112] - - [47, 0.0] - - - [1920, 1280, 1, 1920] - - [48, 0.0] - - - [24, 3072, 1, 256] - - [49, 0.0] - - - [2048, 1792, 1, 59744] - - [947, 0.0] - - - [120, 1024, 1, 64] - - [50, 0.0] - - - [32, 491520, 1, 105] - - [1251, 0.0] - - - [64, 491520, 1, 1224] - - [51, 0.0] - - - [256, 1280, 1, 49152] - - [943, 0.0] - - - [120, 3072, 1, 144] - - [1253, 0.0] - - - [120, 3072, 1, 240] - - [52, 0.0] - - - [120, 3072, 1, 512] - - [1255, 0.0] - - - [125, 3072, 1, 320] - - [1257, 0.0] - - - [256, 1792, 1, 5120] - - [948, 0.0] - - - [192, 1280, 1, 2048] - - [53, 0.0] - - - [160, 3072, 1, 2048] - - [54, 0.0] - - - [160, 286720, 1, 264] - - [55, 0.0] - - - [256, 245760, 1, 648] - - [56, 0.0] - - - [256, 3072, 1, 7524] - - [941, 0.0] - - - [256, 10240, 1, 256] - - [57, 0.0] - - - [1025, 245760, 1, 2126] - - [58, 0.0] - - - [512, 1280, 1, 240] - - [59, 0.0] - - - [512, 3072, 1, 3000] - - [60, 0.0] - - - [840, 3072, 1, 256] - - [61, 0.0] - - - [1536, 3072, 1, 3264] - - [62, 0.0] - - - [1536, 3072, 1, 48760] - - [1274, 0.0] - - - [2048, 3072, 1, 2560] - - [63, 0.0] - - - [3200, 1792, 1, 1536] - - [64, 0.0] - - - [3200, 1792, 1, 13184] - - [65, 0.0] - - - [6400, 3072, 1, 256] - - [66, 0.0] - - - [7524, 3072, 1, 256] - - [67, 0.0] - - - [9600, 1280, 1, 2304] - - [68, 0.0] - - - [20512, 1280, 1, 256] - - [69, 0.0] - - - [20512, 1280, 1, 1024] - - [70, 0.0] - - - [5120, 1792, 1, 256] - - [71, 0.0] - - - [8224, 1792, 1, 256] - - [72, 0.0] - - - [8224, 1792, 1, 1024] - - [73, 0.0] - - - [13184, 1792, 1, 1024] - - [74, 0.0] - - - [48760, 3072, 1, 256] - - [75, 0.0] - - - [68032, 1280, 1, 256] - - [76, 0.0] - - - [68032, 1280, 1, 1024] - - [77, 0.0] - - - [160, 257, 1792, 32] - - [78, 0.0] - - - [192, 2126, 1280, 32] - - [79, 0.0] - - - [120, 102, 3072, 32] - - [80, 0.0] - - - [105, 160, 3072, 40] - - [81, 0.0] - - - [160, 105, 3072, 40] - - [82, 0.0] - - - [120, 3072, 1, 64] - - [83, 0.0] - - - [120, 3072, 1, 72] - - [84, 0.0] - - - [128256, 32768, 1, 4096] - - [85, 0.0] - - - [4096, 4096, 1, 4096] - - [86, 0.0] - - - [4096, 22080, 1, 4096] - - [87, 0.0] - - - [16384, 4096, 1, 4096] - - [88, 0.0] - - - [102, 120, 3072, 32] - - [89, 0.0] - - - [120, 618, 3072, 32] - - [90, 0.0] - - - [618, 120, 3072, 32] - - [91, 0.0] - - - [256, 232, 1280, 256] - - [92, 0.0] - - - [256, 532, 1280, 256] - - [93, 0.0] - - - [192, 641, 1024, 32] - - [94, 0.0] - - - [192, 2126, 1024, 32] - - [95, 0.0] - - - [641, 192, 1024, 32] - - [96, 0.0] - - - [2126, 192, 1024, 32] - - [97, 0.0] - - - [160, 1867, 1792, 32] - - [98, 0.0] - - - [1867, 160, 1792, 32] - - [99, 0.0] - - - [1, 1792, 1, 512] - - [100, 0.0] - - - [32, 1792, 1, 128] - - [101, 0.0] - - - [128, 1792, 1, 2048] - - [102, 0.0] - - - [256, 1280, 1, 5692] - - [944, 0.0] - - - [1536, 1280, 1, 5692] - - [103, 0.0] - - - [5692, 1280, 1, 256] - - [104, 0.0] - - - [15, 368640, 1, 72] - - [105, 0.0] - - - [32, 368640, 1, 102] - - [106, 0.0] - - - [54, 368640, 1, 618] - - [107, 0.0] - - - [118, 368640, 1, 102] - - [108, 0.0] - - - [128, 3072, 1, 48] - - [109, 0.0] - - - [128, 3072, 1, 60] - - [110, 0.0] - - - [128, 3072, 1, 360] - - [111, 0.0] - - - [128, 3072, 1, 1320] - - [112, 0.0] - - - [240, 3072, 1, 40] - - [113, 0.0] - - - [240, 3072, 1, 48] - - [114, 0.0] - - - [240, 3072, 1, 60] - - [115, 0.0] - - - [240, 3072, 1, 360] - - [1260, 0.0] - - - [240, 3072, 1, 1320] - - [116, 0.0] - - - [256, 3072, 1, 768] - - [117, 0.0] - - - [256, 3072, 1, 1800] - - [118, 0.0] - - - [304, 368640, 1, 618] - - [1266, 0.0] - - - [480, 3072, 1, 240] - - [119, 0.0] - - - [504, 3072, 1, 256] - - [120, 0.0] - - - [768, 3072, 1, 2560] - - [121, 0.0] - - - [1024, 3072, 1, 2048] - - [122, 0.0] - - - [2048, 3072, 1, 768] - - [123, 0.0] - - - [2048, 3072, 1, 1024] - - [124, 0.0] - - - [2560, 3072, 1, 768] - - [125, 0.0] - - - [3840, 3072, 1, 256] - - [126, 0.0] - - - [19776, 3072, 1, 256] - - [127, 0.0] - - - [32, 286720, 1, 264] - - [128, 0.0] - - - [192, 1280, 1, 512] - - [129, 0.0] - - - [512, 1280, 1, 32] - - [130, 0.0] - - - [512, 1280, 1, 60] - - [131, 0.0] - - - [512, 1280, 1, 64] - - [132, 0.0] - - - [512, 1280, 1, 72] - - [133, 0.0] - - - [512, 1280, 1, 96] - - [134, 0.0] - - - [512, 1280, 1, 120] - - [135, 0.0] - - - [512, 1280, 1, 144] - - [136, 0.0] - - - [512, 1280, 1, 160] - - [137, 0.0] - - - [512, 1280, 1, 192] - - [138, 0.0] - - - [1562, 1280, 1, 512] - - [139, 0.0] - - - [160, 1792, 1, 512] - - [140, 0.0] - - - [160, 1792, 1, 2048] - - [141, 0.0] - - - [512, 1792, 1, 60] - - [142, 0.0] - - - [512, 1792, 1, 64] - - [143, 0.0] - - - [512, 1792, 1, 72] - - [144, 0.0] - - - [512, 1792, 1, 96] - - [145, 0.0] - - - [2895, 1792, 1, 512] - - [146, 0.0] - - - [256, 450560, 1, 192] - - [147, 0.0] - - - [256, 296960, 1, 256] - - [148, 0.0] - - - [256, 680960, 1, 256] - - [149, 0.0] - - - [384, 57600, 1, 256] - - [150, 0.0] - - - [512, 40960, 1, 256] - - [151, 0.0] - - - [512, 296960, 1, 256] - - [152, 0.0] - - - [512, 680960, 1, 256] - - [153, 0.0] - - - [65536, 1280, 1, 256] - - [154, 0.0] - - - [120, 1024, 1, 72] - - [155, 0.0] - - - [120, 1024, 1, 120] - - [156, 0.0] - - - [120, 1024, 1, 144] - - [157, 0.0] - - - [120, 1024, 1, 160] - - [158, 0.0] - - - [120, 1024, 1, 240] - - [159, 0.0] - - - [120, 1024, 1, 512] - - [160, 0.0] - - - [125, 1024, 1, 320] - - [161, 0.0] - - - [256, 1024, 1, 7524] - - [162, 0.0] - - - [512, 1024, 1, 3000] - - [1272, 0.0] - - - [7524, 1024, 1, 256] - - [163, 0.0] - - - [128, 1792, 1, 112] - - [164, 0.0] - - - [128, 1792, 1, 2352] - - [165, 0.0] - - - [128, 1792, 1, 20224] - - [166, 0.0] - - - [160, 1792, 1, 804] - - [167, 0.0] - - - [320, 1792, 1, 112] - - [168, 0.0] - - - [320, 1792, 1, 380] - - [169, 0.0] - - - [320, 1792, 1, 804] - - [170, 0.0] - - - [320, 1792, 1, 2352] - - [171, 0.0] - - - [320, 1792, 1, 20224] - - [1267, 0.0] - - - [1, 3072, 1, 512] - - [172, 0.0] - - - [32, 3072, 1, 2048] - - [173, 0.0] - - - [256, 1024, 1, 256] - - [174, 0.0] - - - [256, 1024, 1, 49152] - - [175, 0.0] - - - [256, 196608, 1, 648] - - [176, 0.0] - - - [384, 196608, 1, 641] - - [177, 0.0] - - - [1024, 1024, 1, 5632] - - [951, 0.0] - - - [1032, 196608, 1, 2128] - - [178, 0.0] - - - [2048, 1024, 1, 20512] - - [179, 0.0] - - - [2048, 1024, 1, 68032] - - [180, 0.0] - - - [2304, 1024, 1, 9600] - - [181, 0.0] - - - [5632, 1024, 1, 1024] - - [182, 0.0] - - - [9600, 1024, 1, 2304] - - [183, 0.0] - - - [9600, 1024, 1, 5632] - - [184, 0.0] - - - [20512, 1024, 1, 256] - - [185, 0.0] - - - [20512, 1024, 1, 1024] - - [186, 0.0] - - - [68032, 1024, 1, 256] - - [187, 0.0] - - - [68032, 1024, 1, 1024] - - [188, 0.0] - - - [417, 286720, 1, 1867] - - [1271, 0.0] - - - [59744, 1792, 1, 256] - - [189, 0.0] - - - [59744, 1792, 1, 1024] - - [190, 0.0] - - - [64704, 1792, 1, 1024] - - [191, 0.0] - - - [352, 245760, 1, 3595] - - [1269, 0.0] - - - [10, 368640, 1, 10] - - [192, 0.0] - - - [10, 368640, 1, 221] - - [193, 0.0] - - - [256, 3072, 1, 2722] - - [194, 0.0] - - - [512, 3072, 1, 256] - - [195, 0.0] - - - [1024, 3072, 1, 18992] - - [196, 0.0] - - - [4800, 3072, 1, 1024] - - [197, 0.0] - - - [1, 1792, 1, 128] - - [198, 0.0] - - - [128, 358400, 1, 128] - - [199, 0.0] - - - [128, 57344, 1, 128] - - [200, 0.0] - - - [128, 57344, 1, 256] - - [201, 0.0] - - - [256, 57344, 1, 128] - - [202, 0.0] - - - [128, 1792, 128, 160] - - [203, 0.0] - - - [256, 1280, 1, 768] - - [204, 0.0] - - - [256, 1280, 1, 1664] - - [205, 0.0] - - - [256, 26480, 1, 640] - - [206, 0.0] - - - [256, 189360, 1, 512] - - [207, 0.0] - - - [256, 248960, 1, 1024] - - [208, 0.0] - - - [256, 256000, 1, 256] - - [209, 0.0] - - - [256, 600640, 1, 640] - - [210, 0.0] - - - [256, 640000, 1, 256] - - [211, 0.0] - - - [512, 1280, 1, 512] - - [212, 0.0] - - - [640, 1280, 1, 640] - - [213, 0.0] - - - [768, 1280, 1, 768] - - [214, 0.0] - - - [1024, 1280, 1, 1024] - - [215, 0.0] - - - [1664, 1280, 1, 1664] - - [216, 0.0] - - - [48, 98304, 1, 48] - - [217, 0.0] - - - [48, 98304, 1, 128] - - [218, 0.0] - - - [48, 614400, 1, 48] - - [219, 0.0] - - - [128, 98304, 1, 48] - - [220, 0.0] - - - [48, 3072, 128, 120] - - [221, 0.0] - - - [4, 21907200, 1, 8] - - [222, 0.0] - - - [2, 20920192, 1, 10] - - [223, 0.0] - - - [4, 13744384, 1, 12] - - [224, 0.0] - - - [1, 592704, 1, 12] - - [225, 0.0] - - - [1, 786144, 1, 9] - - [226, 0.0] - - - [1, 875568, 1, 8] - - [227, 0.0] - - - [1, 925632, 1, 10] - - [228, 0.0] - - - [16, 8192, 1, 8192] - - [229, 0.0] - - - [32000, 8192, 1, 8192] - - [230, 0.0] - - - [10240, 8192, 1, 16] - - [231, 0.0] - - - [8192, 8192, 1, 16] - - [232, 0.0] - - - [128, 40, 1, 2880] - - [233, 0.0] - - - [128, 184, 1, 2880] - - [234, 0.0] - - - [128, 232, 1, 2880] - - [235, 0.0] - - - [128, 416, 1, 2880] - - [236, 0.0] - - - [128, 576, 1, 2880] - - [237, 0.0] - - - [128, 768, 1, 2880] - - [238, 0.0] - - - [128, 880, 1, 2880] - - [239, 0.0] - - - [128, 1206, 1, 2880] - - [240, 0.0] - - - [128, 6146, 1, 2880] - - [241, 0.0] - - - [640, 16, 1, 2880] - - [242, 0.0] - - - [640, 48, 1, 2880] - - [243, 0.0] - - - [640, 64, 1, 2880] - - [244, 0.0] - - - [640, 176, 1, 2880] - - [245, 0.0] - - - [640, 288, 1, 2880] - - [246, 0.0] - - - [640, 320, 1, 2880] - - [247, 0.0] - - - [640, 512, 1, 2880] - - [248, 0.0] - - - [640, 640, 1, 2880] - - [249, 0.0] - - - [640, 768, 1, 2880] - - [250, 0.0] - - - [640, 912, 1, 2880] - - [251, 0.0] - - - [640, 1206, 1, 2880] - - [252, 0.0] - - - [640, 6146, 1, 2880] - - [253, 0.0] - - - [2880, 4, 1, 4096] - - [254, 0.0] - - - [2880, 1, 1, 4096] - - [254, 0.0] - - - [2880, 2, 1, 4096] - - [254, 0.0] - - - [2880, 48, 1, 512] - - [255, 0.0] - - - [2880, 72, 1, 4096] - - [256, 0.0] - - - [2880, 112, 1, 4096] - - [257, 0.0] - - - [2880, 152, 1, 512] - - [258, 0.0] - - - [2880, 184, 1, 512] - - [259, 0.0] - - - [2880, 208, 1, 4096] - - [260, 0.0] - - - [2880, 232, 1, 512] - - [261, 0.0] - - - [2880, 272, 1, 512] - - [262, 0.0] - - - [2880, 336, 1, 512] - - [263, 0.0] - - - [2880, 400, 1, 512] - - [264, 0.0] - - - [2880, 464, 1, 4096] - - [265, 0.0] - - - [2880, 528, 1, 4096] - - [266, 0.0] - - - [2880, 608, 1, 512] - - [267, 0.0] - - - [2880, 656, 1, 4096] - - [268, 0.0] - - - [2880, 720, 1, 4096] - - [269, 0.0] - - - [2880, 784, 1, 4096] - - [270, 0.0] - - - [2880, 864, 1, 512] - - [271, 0.0] - - - [2880, 912, 1, 4096] - - [272, 0.0] - - - [2880, 976, 1, 4096] - - [273, 0.0] - - - [2880, 1206, 1, 512] - - [274, 0.0] - - - [2880, 7169, 1, 512] - - [275, 0.0] - - - [2880, 7217, 1, 4096] - - [276, 0.0] - - - [5120, 4, 1, 2880] - - [277, 0.0] - - - [5120, 1, 1, 2880] - - [277, 0.0] - - - [5120, 2, 1, 2880] - - [277, 0.0] - - - [5120, 72, 1, 2880] - - [278, 0.0] - - - [5120, 120, 1, 2880] - - [279, 0.0] - - - [5120, 176, 1, 2880] - - [280, 0.0] - - - [5120, 240, 1, 2880] - - [281, 0.0] - - - [5120, 368, 1, 2880] - - [282, 0.0] - - - [5120, 496, 1, 2880] - - [283, 0.0] - - - [5120, 624, 1, 2880] - - [284, 0.0] - - - [5120, 784, 1, 2880] - - [285, 0.0] - - - [5120, 912, 1, 2880] - - [286, 0.0] - - - [5120, 1202, 1, 2880] - - [287, 0.0] - - - [5120, 7217, 1, 2880] - - [288, 0.0] - - - [25136, 5, 1, 2880] - - [289, 0.0] - - - [25136, 15, 1, 2880] - - [290, 0.0] - - - [25136, 30, 1, 2880] - - [291, 0.0] - - - [25136, 46, 1, 2880] - - [292, 0.0] - - - [25136, 62, 1, 2880] - - [293, 0.0] - - - [25136, 85, 1, 2880] - - [294, 0.0] - - - [201088, 8, 1, 2880] - - [295, 0.0] - - - [201088, 17, 1, 2880] - - [296, 0.0] - - - [201088, 33, 1, 2880] - - [297, 0.0] - - - [201088, 64, 1, 2880] - - [298, 0.0] - - - [128, 48, 1, 2880] - - [299, 0.0] - - - [128, 176, 1, 2880] - - [300, 0.0] - - - [128, 248, 1, 2880] - - [301, 0.0] - - - [128, 304, 1, 2880] - - [302, 0.0] - - - [128, 368, 1, 2880] - - [303, 0.0] - - - [128, 496, 1, 2880] - - [304, 0.0] - - - [128, 528, 1, 2880] - - [305, 0.0] - - - [128, 592, 1, 2880] - - [306, 0.0] - - - [128, 752, 1, 2880] - - [307, 0.0] - - - [128, 896, 1, 2880] - - [308, 0.0] - - - [128, 944, 1, 2880] - - [309, 0.0] - - - [128, 1008, 1, 2880] - - [310, 0.0] - - - [128, 2050, 1, 2880] - - [311, 0.0] - - - [128, 8192, 1, 2880] - - [312, 0.0] - - - [640, 96, 1, 2880] - - [313, 0.0] - - - [640, 136, 1, 2880] - - [314, 0.0] - - - [640, 224, 1, 2880] - - [315, 0.0] - - - [640, 272, 1, 2880] - - [316, 0.0] - - - [640, 352, 1, 2880] - - [317, 0.0] - - - [640, 384, 1, 2880] - - [318, 0.0] - - - [640, 528, 1, 2880] - - [319, 0.0] - - - [640, 624, 1, 2880] - - [320, 0.0] - - - [640, 752, 1, 2880] - - [321, 0.0] - - - [640, 880, 1, 2880] - - [322, 0.0] - - - [640, 1008, 1, 2880] - - [323, 0.0] - - - [640, 8183, 1, 2880] - - [324, 0.0] - - - [2880, 24, 1, 512] - - [325, 0.0] - - - [2880, 56, 1, 512] - - [326, 0.0] - - - [2880, 88, 1, 512] - - [327, 0.0] - - - [2880, 104, 1, 4096] - - [328, 0.0] - - - [2880, 136, 1, 512] - - [329, 0.0] - - - [2880, 176, 1, 4096] - - [330, 0.0] - - - [2880, 200, 1, 512] - - [331, 0.0] - - - [2880, 240, 1, 512] - - [332, 0.0] - - - [2880, 272, 1, 4096] - - [333, 0.0] - - - [2880, 352, 1, 4096] - - [334, 0.0] - - - [2880, 416, 1, 512] - - [335, 0.0] - - - [2880, 496, 1, 512] - - [336, 0.0] - - - [2880, 560, 1, 4096] - - [337, 0.0] - - - [2880, 624, 1, 512] - - [338, 0.0] - - - [2880, 688, 1, 4096] - - [339, 0.0] - - - [2880, 768, 1, 512] - - [340, 0.0] - - - [2880, 816, 1, 4096] - - [341, 0.0] - - - [2880, 896, 1, 512] - - [342, 0.0] - - - [2880, 960, 1, 512] - - [343, 0.0] - - - [2880, 1024, 1, 4096] - - [344, 0.0] - - - [2880, 4096, 1, 512] - - [345, 0.0] - - - [2880, 7204, 1, 512] - - [346, 0.0] - - - [2880, 8184, 1, 4096] - - [347, 0.0] - - - [5120, 32, 1, 2880] - - [348, 0.0] - - - [5120, 96, 1, 2880] - - [349, 0.0] - - - [5120, 160, 1, 2880] - - [350, 0.0] - - - [5120, 224, 1, 2880] - - [351, 0.0] - - - [5120, 304, 1, 2880] - - [352, 0.0] - - - [5120, 432, 1, 2880] - - [353, 0.0] - - - [5120, 560, 1, 2880] - - [354, 0.0] - - - [5120, 688, 1, 2880] - - [355, 0.0] - - - [5120, 800, 1, 2880] - - [356, 0.0] - - - [5120, 960, 1, 2880] - - [357, 0.0] - - - [5120, 4096, 1, 2880] - - [358, 0.0] - - - [5120, 8184, 1, 2880] - - [359, 0.0] - - - [25136, 10, 1, 2880] - - [360, 0.0] - - - [25136, 24, 1, 2880] - - [361, 0.0] - - - [25136, 35, 1, 2880] - - [362, 0.0] - - - [25136, 52, 1, 2880] - - [363, 0.0] - - - [25136, 80, 1, 2880] - - [364, 0.0] - - - [201088, 5, 1, 2880] - - [365, 0.0] - - - [201088, 25, 1, 2880] - - [366, 0.0] - - - [201088, 50, 1, 2880] - - [367, 0.0] - - - [201088, 65, 1, 2880] - - [368, 0.0] - - - [128, 56, 1, 2880] - - [369, 0.0] - - - [128, 144, 1, 2880] - - [370, 0.0] - - - [128, 208, 1, 2880] - - [371, 0.0] - - - [128, 240, 1, 2880] - - [372, 0.0] - - - [128, 480, 1, 2880] - - [373, 0.0] - - - [128, 624, 1, 2880] - - [374, 0.0] - - - [128, 672, 1, 2880] - - [375, 0.0] - - - [128, 736, 1, 2880] - - [376, 0.0] - - - [128, 912, 1, 2880] - - [377, 0.0] - - - [128, 1881, 1, 2880] - - [378, 0.0] - - - [128, 8183, 1, 2880] - - [379, 0.0] - - - [640, 8, 1, 2880] - - [380, 0.0] - - - [640, 32, 1, 2880] - - [381, 0.0] - - - [640, 128, 1, 2880] - - [382, 0.0] - - - [640, 160, 1, 2880] - - [383, 0.0] - - - [640, 256, 1, 2880] - - [384, 0.0] - - - [640, 368, 1, 2880] - - [385, 0.0] - - - [640, 400, 1, 2880] - - [386, 0.0] - - - [640, 496, 1, 2880] - - [387, 0.0] - - - [640, 656, 1, 2880] - - [388, 0.0] - - - [640, 784, 1, 2880] - - [389, 0.0] - - - [640, 896, 1, 2880] - - [390, 0.0] - - - [640, 1024, 1, 2880] - - [391, 0.0] - - - [640, 7169, 1, 2880] - - [392, 0.0] - - - [2880, 16, 1, 512] - - [393, 0.0] - - - [2880, 48, 1, 4096] - - [394, 0.0] - - - [2880, 80, 1, 4096] - - [395, 0.0] - - - [2880, 112, 1, 512] - - [396, 0.0] - - - [2880, 136, 1, 4096] - - [397, 0.0] - - - [2880, 168, 1, 4096] - - [398, 0.0] - - - [2880, 208, 1, 512] - - [399, 0.0] - - - [2880, 232, 1, 4096] - - [400, 0.0] - - - [2880, 288, 1, 4096] - - [401, 0.0] - - - [2880, 336, 1, 4096] - - [402, 0.0] - - - [2880, 400, 1, 4096] - - [403, 0.0] - - - [2880, 496, 1, 4096] - - [404, 0.0] - - - [2880, 560, 1, 512] - - [405, 0.0] - - - [2880, 624, 1, 4096] - - [406, 0.0] - - - [2880, 688, 1, 512] - - [407, 0.0] - - - [2880, 736, 1, 4096] - - [408, 0.0] - - - [2880, 800, 1, 512] - - [409, 0.0] - - - [2880, 880, 1, 512] - - [410, 0.0] - - - [2880, 944, 1, 512] - - [411, 0.0] - - - [2880, 1008, 1, 512] - - [412, 0.0] - - - [2880, 2049, 1, 512] - - [413, 0.0] - - - [2880, 7169, 1, 4096] - - [414, 0.0] - - - [2880, 8183, 1, 4096] - - [415, 0.0] - - - [5120, 16, 1, 2880] - - [416, 0.0] - - - [5120, 80, 1, 2880] - - [417, 0.0] - - - [5120, 136, 1, 2880] - - [418, 0.0] - - - [5120, 208, 1, 2880] - - [419, 0.0] - - - [5120, 288, 1, 2880] - - [420, 0.0] - - - [5120, 416, 1, 2880] - - [421, 0.0] - - - [5120, 544, 1, 2880] - - [422, 0.0] - - - [5120, 672, 1, 2880] - - [423, 0.0] - - - [5120, 832, 1, 2880] - - [424, 0.0] - - - [5120, 928, 1, 2880] - - [425, 0.0] - - - [5120, 2050, 1, 2880] - - [426, 0.0] - - - [5120, 8189, 1, 2880] - - [427, 0.0] - - - [25136, 11, 1, 2880] - - [428, 0.0] - - - [25136, 25, 1, 2880] - - [429, 0.0] - - - [25136, 40, 1, 2880] - - [430, 0.0] - - - [25136, 56, 1, 2880] - - [431, 0.0] - - - [25136, 77, 1, 2880] - - [432, 0.0] - - - [201088, 6, 1, 2880] - - [433, 0.0] - - - [201088, 16, 1, 2880] - - [434, 0.0] - - - [201088, 56, 1, 2880] - - [435, 0.0] - - - [201088, 80, 1, 2880] - - [436, 0.0] - - - [128, 72, 1, 2880] - - [437, 0.0] - - - [128, 192, 1, 2880] - - [438, 0.0] - - - [128, 352, 1, 2880] - - [439, 0.0] - - - [128, 688, 1, 2880] - - [440, 0.0] - - - [128, 800, 1, 2880] - - [441, 0.0] - - - [128, 832, 1, 2880] - - [442, 0.0] - - - [128, 864, 1, 2880] - - [443, 0.0] - - - [128, 1024, 1, 2880] - - [444, 0.0] - - - [128, 2049, 1, 2880] - - [445, 0.0] - - - [128, 5790, 1, 2880] - - [446, 0.0] - - - [128, 7204, 1, 2880] - - [447, 0.0] - - - [128, 7217, 1, 2880] - - [448, 0.0] - - - [128, 8184, 1, 2880] - - [449, 0.0] - - - [640, 4, 1, 2880] - - [450, 0.0] - - - [640, 1, 1, 2880] - - [450, 0.0] - - - [640, 2, 1, 2880] - - [450, 0.0] - - - [640, 104, 1, 2880] - - [451, 0.0] - - - [640, 192, 1, 2880] - - [452, 0.0] - - - [640, 208, 1, 2880] - - [453, 0.0] - - - [640, 232, 1, 2880] - - [454, 0.0] - - - [640, 416, 1, 2880] - - [455, 0.0] - - - [640, 560, 1, 2880] - - [456, 0.0] - - - [640, 688, 1, 2880] - - [457, 0.0] - - - [640, 800, 1, 2880] - - [458, 0.0] - - - [640, 928, 1, 2880] - - [459, 0.0] - - - [640, 1881, 1, 2880] - - [460, 0.0] - - - [640, 7204, 1, 2880] - - [461, 0.0] - - - [2880, 16, 1, 4096] - - [462, 0.0] - - - [2880, 40, 1, 512] - - [463, 0.0] - - - [2880, 72, 1, 512] - - [464, 0.0] - - - [2880, 120, 1, 512] - - [465, 0.0] - - - [2880, 144, 1, 4096] - - [466, 0.0] - - - [2880, 168, 1, 512] - - [467, 0.0] - - - [2880, 200, 1, 4096] - - [468, 0.0] - - - [2880, 240, 1, 4096] - - [469, 0.0] - - - [2880, 304, 1, 512] - - [470, 0.0] - - - [2880, 368, 1, 512] - - [471, 0.0] - - - [2880, 416, 1, 4096] - - [472, 0.0] - - - [2880, 480, 1, 512] - - [473, 0.0] - - - [2880, 528, 1, 512] - - [474, 0.0] - - - [2880, 592, 1, 4096] - - [475, 0.0] - - - [2880, 656, 1, 512] - - [476, 0.0] - - - [2880, 720, 1, 512] - - [477, 0.0] - - - [2880, 784, 1, 512] - - [478, 0.0] - - - [2880, 848, 1, 512] - - [479, 0.0] - - - [2880, 912, 1, 512] - - [480, 0.0] - - - [2880, 976, 1, 512] - - [481, 0.0] - - - [2880, 1202, 1, 4096] - - [482, 0.0] - - - [2880, 4096, 1, 4096] - - [483, 0.0] - - - [2880, 7821, 1, 4096] - - [484, 0.0] - - - [2880, 8192, 1, 512] - - [485, 0.0] - - - [5120, 56, 1, 2880] - - [486, 0.0] - - - [5120, 112, 1, 2880] - - [487, 0.0] - - - [5120, 184, 1, 2880] - - [488, 0.0] - - - [5120, 256, 1, 2880] - - [489, 0.0] - - - [5120, 352, 1, 2880] - - [490, 0.0] - - - [5120, 480, 1, 2880] - - [491, 0.0] - - - [5120, 608, 1, 2880] - - [492, 0.0] - - - [5120, 736, 1, 2880] - - [493, 0.0] - - - [5120, 864, 1, 2880] - - [494, 0.0] - - - [5120, 1024, 1, 2880] - - [495, 0.0] - - - [5120, 7821, 1, 2880] - - [496, 0.0] - - - [25136, 8, 1, 2880] - - [497, 0.0] - - - [25136, 16, 1, 2880] - - [497, 0.0] - - - [25136, 31, 1, 2880] - - [498, 0.0] - - - [25136, 48, 1, 2880] - - [499, 0.0] - - - [25136, 63, 1, 2880] - - [500, 0.0] - - - [25136, 89, 1, 2880] - - [501, 0.0] - - - [201088, 12, 1, 2880] - - [502, 0.0] - - - [201088, 32, 1, 2880] - - [503, 0.0] - - - [201088, 63, 1, 2880] - - [504, 0.0] - - - [128, 80, 1, 2880] - - [505, 0.0] - - - [128, 120, 1, 2880] - - [506, 0.0] - - - [128, 168, 1, 2880] - - [507, 0.0] - - - [128, 224, 1, 2880] - - [508, 0.0] - - - [128, 336, 1, 2880] - - [509, 0.0] - - - [128, 432, 1, 2880] - - [510, 0.0] - - - [128, 464, 1, 2880] - - [511, 0.0] - - - [128, 560, 1, 2880] - - [512, 0.0] - - - [128, 608, 1, 2880] - - [513, 0.0] - - - [128, 640, 1, 2880] - - [514, 0.0] - - - [128, 960, 1, 2880] - - [515, 0.0] - - - [128, 3073, 1, 2880] - - [516, 0.0] - - - [128, 7177, 1, 2880] - - [517, 0.0] - - - [128, 7821, 1, 2880] - - [518, 0.0] - - - [128, 8189, 1, 2880] - - [519, 0.0] - - - [640, 72, 1, 2880] - - [520, 0.0] - - - [640, 80, 1, 2880] - - [521, 0.0] - - - [640, 184, 1, 2880] - - [522, 0.0] - - - [640, 200, 1, 2880] - - [523, 0.0] - - - [640, 432, 1, 2880] - - [524, 0.0] - - - [640, 544, 1, 2880] - - [525, 0.0] - - - [640, 672, 1, 2880] - - [526, 0.0] - - - [640, 816, 1, 2880] - - [527, 0.0] - - - [640, 944, 1, 2880] - - [528, 0.0] - - - [640, 2049, 1, 2880] - - [529, 0.0] - - - [640, 5790, 1, 2880] - - [530, 0.0] - - - [640, 8192, 1, 2880] - - [531, 0.0] - - - [2880, 24, 1, 4096] - - [532, 0.0] - - - [2880, 56, 1, 4096] - - [533, 0.0] - - - [2880, 88, 1, 4096] - - [534, 0.0] - - - [2880, 120, 1, 4096] - - [535, 0.0] - - - [2880, 152, 1, 4096] - - [536, 0.0] - - - [2880, 184, 1, 4096] - - [537, 0.0] - - - [2880, 216, 1, 4096] - - [538, 0.0] - - - [2880, 248, 1, 4096] - - [539, 0.0] - - - [2880, 304, 1, 4096] - - [540, 0.0] - - - [2880, 368, 1, 4096] - - [541, 0.0] - - - [2880, 432, 1, 512] - - [542, 0.0] - - - [2880, 464, 1, 512] - - [543, 0.0] - - - [2880, 544, 1, 4096] - - [544, 0.0] - - - [2880, 608, 1, 4096] - - [545, 0.0] - - - [2880, 672, 1, 4096] - - [546, 0.0] - - - [2880, 736, 1, 512] - - [547, 0.0] - - - [2880, 800, 1, 4096] - - [548, 0.0] - - - [2880, 864, 1, 4096] - - [549, 0.0] - - - [2880, 928, 1, 4096] - - [550, 0.0] - - - [2880, 992, 1, 4096] - - [551, 0.0] - - - [2880, 1881, 1, 512] - - [552, 0.0] - - - [2880, 6146, 1, 512] - - [553, 0.0] - - - [2880, 8183, 1, 512] - - [554, 0.0] - - - [5120, 8, 1, 2880] - - [555, 0.0] - - - [5120, 48, 1, 2880] - - [556, 0.0] - - - [5120, 144, 1, 2880] - - [557, 0.0] - - - [5120, 200, 1, 2880] - - [558, 0.0] - - - [5120, 272, 1, 2880] - - [559, 0.0] - - - [5120, 400, 1, 2880] - - [560, 0.0] - - - [5120, 512, 1, 2880] - - [561, 0.0] - - - [5120, 640, 1, 2880] - - [562, 0.0] - - - [5120, 752, 1, 2880] - - [563, 0.0] - - - [5120, 896, 1, 2880] - - [564, 0.0] - - - [5120, 992, 1, 2880] - - [565, 0.0] - - - [5120, 7177, 1, 2880] - - [566, 0.0] - - - [25136, 7, 1, 2880] - - [567, 0.0] - - - [25136, 17, 1, 2880] - - [568, 0.0] - - - [25136, 32, 1, 2880] - - [569, 0.0] - - - [25136, 49, 1, 2880] - - [570, 0.0] - - - [25136, 66, 1, 2880] - - [571, 0.0] - - - [25136, 88, 1, 2880] - - [572, 0.0] - - - [201088, 10, 1, 2880] - - [573, 0.0] - - - [201088, 15, 1, 2880] - - [574, 0.0] - - - [201088, 40, 1, 2880] - - [575, 0.0] - - - [201088, 71, 1, 2880] - - [576, 0.0] - - - [201088, 87, 1, 2880] - - [577, 0.0] - - - [128, 88, 1, 2880] - - [578, 0.0] - - - [128, 136, 1, 2880] - - [579, 0.0] - - - [128, 288, 1, 2880] - - [580, 0.0] - - - [128, 320, 1, 2880] - - [581, 0.0] - - - [128, 400, 1, 2880] - - [582, 0.0] - - - [128, 544, 1, 2880] - - [583, 0.0] - - - [128, 720, 1, 2880] - - [584, 0.0] - - - [128, 848, 1, 2880] - - [585, 0.0] - - - [128, 928, 1, 2880] - - [586, 0.0] - - - [128, 7169, 1, 2880] - - [587, 0.0] - - - [640, 24, 1, 2880] - - [588, 0.0] - - - [640, 112, 1, 2880] - - [589, 0.0] - - - [640, 144, 1, 2880] - - [590, 0.0] - - - [640, 168, 1, 2880] - - [591, 0.0] - - - [640, 480, 1, 2880] - - [592, 0.0] - - - [640, 592, 1, 2880] - - [593, 0.0] - - - [640, 720, 1, 2880] - - [594, 0.0] - - - [640, 864, 1, 2880] - - [595, 0.0] - - - [640, 992, 1, 2880] - - [596, 0.0] - - - [640, 7215, 1, 2880] - - [597, 0.0] - - - [2880, 8, 1, 512] - - [598, 0.0] - - - [2880, 32, 1, 4096] - - [599, 0.0] - - - [2880, 64, 1, 4096] - - [600, 0.0] - - - [2880, 96, 1, 4096] - - [601, 0.0] - - - [2880, 128, 1, 4096] - - [602, 0.0] - - - [2880, 160, 1, 4096] - - [603, 0.0] - - - [2880, 192, 1, 4096] - - [604, 0.0] - - - [2880, 224, 1, 4096] - - [605, 0.0] - - - [2880, 256, 1, 4096] - - [606, 0.0] - - - [2880, 320, 1, 4096] - - [607, 0.0] - - - [2880, 384, 1, 4096] - - [608, 0.0] - - - [2880, 448, 1, 4096] - - [609, 0.0] - - - [2880, 512, 1, 4096] - - [610, 0.0] - - - [2880, 576, 1, 4096] - - [611, 0.0] - - - [2880, 640, 1, 4096] - - [612, 0.0] - - - [2880, 704, 1, 4096] - - [613, 0.0] - - - [2880, 768, 1, 4096] - - [614, 0.0] - - - [2880, 832, 1, 4096] - - [615, 0.0] - - - [2880, 896, 1, 4096] - - [616, 0.0] - - - [2880, 960, 1, 4096] - - [617, 0.0] - - - [2880, 1024, 1, 512] - - [618, 0.0] - - - [2880, 3073, 1, 512] - - [619, 0.0] - - - [2880, 7209, 1, 4096] - - [620, 0.0] - - - [2880, 8189, 1, 4096] - - [621, 0.0] - - - [5120, 40, 1, 2880] - - [622, 0.0] - - - [5120, 104, 1, 2880] - - [623, 0.0] - - - [5120, 168, 1, 2880] - - [624, 0.0] - - - [5120, 232, 1, 2880] - - [625, 0.0] - - - [5120, 336, 1, 2880] - - [626, 0.0] - - - [5120, 464, 1, 2880] - - [627, 0.0] - - - [5120, 592, 1, 2880] - - [628, 0.0] - - - [5120, 704, 1, 2880] - - [629, 0.0] - - - [5120, 816, 1, 2880] - - [630, 0.0] - - - [5120, 976, 1, 2880] - - [631, 0.0] - - - [5120, 1881, 1, 2880] - - [632, 0.0] - - - [5120, 8183, 1, 2880] - - [633, 0.0] - - - [25136, 9, 1, 2880] - - [634, 0.0] - - - [25136, 19, 1, 2880] - - [635, 0.0] - - - [25136, 33, 1, 2880] - - [636, 0.0] - - - [25136, 54, 1, 2880] - - [637, 0.0] - - - [25136, 74, 1, 2880] - - [638, 0.0] - - - [201088, 7, 1, 2880] - - [639, 0.0] - - - [201088, 23, 1, 2880] - - [640, 0.0] - - - [201088, 48, 1, 2880] - - [641, 0.0] - - - [201088, 79, 1, 2880] - - [642, 0.0] - - - [128, 104, 1, 2880] - - [643, 0.0] - - - [128, 160, 1, 2880] - - [644, 0.0] - - - [128, 200, 1, 2880] - - [645, 0.0] - - - [128, 384, 1, 2880] - - [646, 0.0] - - - [128, 448, 1, 2880] - - [647, 0.0] - - - [128, 656, 1, 2880] - - [648, 0.0] - - - [128, 992, 1, 2880] - - [649, 0.0] - - - [128, 7209, 1, 2880] - - [650, 0.0] - - - [640, 40, 1, 2880] - - [651, 0.0] - - - [640, 152, 1, 2880] - - [652, 0.0] - - - [640, 248, 1, 2880] - - [653, 0.0] - - - [640, 304, 1, 2880] - - [654, 0.0] - - - [640, 336, 1, 2880] - - [655, 0.0] - - - [640, 448, 1, 2880] - - [656, 0.0] - - - [640, 576, 1, 2880] - - [657, 0.0] - - - [640, 704, 1, 2880] - - [658, 0.0] - - - [640, 832, 1, 2880] - - [659, 0.0] - - - [640, 976, 1, 2880] - - [660, 0.0] - - - [640, 4096, 1, 2880] - - [661, 0.0] - - - [2880, 8, 1, 4096] - - [662, 0.0] - - - [2880, 40, 1, 4096] - - [663, 0.0] - - - [2880, 80, 1, 512] - - [664, 0.0] - - - [2880, 104, 1, 512] - - [665, 0.0] - - - [2880, 144, 1, 512] - - [666, 0.0] - - - [2880, 176, 1, 512] - - [667, 0.0] - - - [2880, 216, 1, 512] - - [668, 0.0] - - - [2880, 248, 1, 512] - - [669, 0.0] - - - [2880, 288, 1, 512] - - [670, 0.0] - - - [2880, 352, 1, 512] - - [671, 0.0] - - - [2880, 432, 1, 4096] - - [672, 0.0] - - - [2880, 480, 1, 4096] - - [673, 0.0] - - - [2880, 544, 1, 512] - - [674, 0.0] - - - [2880, 592, 1, 512] - - [675, 0.0] - - - [2880, 672, 1, 512] - - [676, 0.0] - - - [2880, 752, 1, 512] - - [677, 0.0] - - - [2880, 816, 1, 512] - - [678, 0.0] - - - [2880, 848, 1, 4096] - - [679, 0.0] - - - [2880, 928, 1, 512] - - [680, 0.0] - - - [2880, 992, 1, 512] - - [681, 0.0] - - - [2880, 1881, 1, 4096] - - [682, 0.0] - - - [2880, 5790, 1, 512] - - [683, 0.0] - - - [2880, 7215, 1, 512] - - [684, 0.0] - - - [2880, 8192, 1, 4096] - - [685, 0.0] - - - [5120, 64, 1, 2880] - - [686, 0.0] - - - [5120, 128, 1, 2880] - - [687, 0.0] - - - [5120, 192, 1, 2880] - - [688, 0.0] - - - [5120, 248, 1, 2880] - - [689, 0.0] - - - [5120, 384, 1, 2880] - - [690, 0.0] - - - [5120, 528, 1, 2880] - - [691, 0.0] - - - [5120, 656, 1, 2880] - - [692, 0.0] - - - [5120, 768, 1, 2880] - - [693, 0.0] - - - [5120, 880, 1, 2880] - - [694, 0.0] - - - [5120, 1008, 1, 2880] - - [695, 0.0] - - - [5120, 7209, 1, 2880] - - [696, 0.0] - - - [25136, 6, 1, 2880] - - [697, 0.0] - - - [25136, 14, 1, 2880] - - [698, 0.0] - - - [25136, 27, 1, 2880] - - [699, 0.0] - - - [25136, 44, 1, 2880] - - [700, 0.0] - - - [25136, 60, 1, 2880] - - [701, 0.0] - - - [25136, 128, 1, 2880] - - [702, 0.0] - - - [201088, 11, 1, 2880] - - [703, 0.0] - - - [201088, 31, 1, 2880] - - [704, 0.0] - - - [201088, 58, 1, 2880] - - [705, 0.0] - - - [201088, 128, 1, 2880] - - [706, 0.0] - - - [201088, 88, 1, 2880] - - [706, 0.0] - - - [201088, 89, 1, 2880] - - [706, 0.0] - - - [128, 96, 1, 2880] - - [707, 0.0] - - - [128, 112, 1, 2880] - - [708, 0.0] - - - [128, 152, 1, 2880] - - [709, 0.0] - - - [128, 216, 1, 2880] - - [710, 0.0] - - - [128, 272, 1, 2880] - - [711, 0.0] - - - [128, 512, 1, 2880] - - [712, 0.0] - - - [128, 704, 1, 2880] - - [713, 0.0] - - - [128, 784, 1, 2880] - - [714, 0.0] - - - [128, 816, 1, 2880] - - [715, 0.0] - - - [128, 976, 1, 2880] - - [716, 0.0] - - - [128, 1202, 1, 2880] - - [717, 0.0] - - - [128, 7215, 1, 2880] - - [718, 0.0] - - - [640, 56, 1, 2880] - - [719, 0.0] - - - [640, 88, 1, 2880] - - [720, 0.0] - - - [640, 120, 1, 2880] - - [721, 0.0] - - - [640, 216, 1, 2880] - - [722, 0.0] - - - [640, 240, 1, 2880] - - [723, 0.0] - - - [640, 464, 1, 2880] - - [724, 0.0] - - - [640, 608, 1, 2880] - - [725, 0.0] - - - [640, 736, 1, 2880] - - [726, 0.0] - - - [640, 848, 1, 2880] - - [727, 0.0] - - - [640, 960, 1, 2880] - - [728, 0.0] - - - [640, 3073, 1, 2880] - - [729, 0.0] - - - [640, 8184, 1, 2880] - - [730, 0.0] - - - [2880, 32, 1, 512] - - [731, 0.0] - - - [2880, 64, 1, 512] - - [732, 0.0] - - - [2880, 96, 1, 512] - - [733, 0.0] - - - [2880, 128, 1, 512] - - [734, 0.0] - - - [2880, 160, 1, 512] - - [735, 0.0] - - - [2880, 192, 1, 512] - - [736, 0.0] - - - [2880, 224, 1, 512] - - [737, 0.0] - - - [2880, 256, 1, 512] - - [738, 0.0] - - - [2880, 320, 1, 512] - - [739, 0.0] - - - [2880, 384, 1, 512] - - [740, 0.0] - - - [2880, 448, 1, 512] - - [741, 0.0] - - - [2880, 512, 1, 512] - - [742, 0.0] - - - [2880, 576, 1, 512] - - [743, 0.0] - - - [2880, 640, 1, 512] - - [744, 0.0] - - - [2880, 704, 1, 512] - - [745, 0.0] - - - [2880, 752, 1, 4096] - - [746, 0.0] - - - [2880, 832, 1, 512] - - [747, 0.0] - - - [2880, 880, 1, 4096] - - [748, 0.0] - - - [2880, 944, 1, 4096] - - [749, 0.0] - - - [2880, 1008, 1, 4096] - - [750, 0.0] - - - [2880, 2050, 1, 4096] - - [751, 0.0] - - - [2880, 7177, 1, 4096] - - [752, 0.0] - - - [2880, 8184, 1, 512] - - [753, 0.0] - - - [5120, 24, 1, 2880] - - [754, 0.0] - - - [5120, 88, 1, 2880] - - [755, 0.0] - - - [5120, 152, 1, 2880] - - [756, 0.0] - - - [5120, 216, 1, 2880] - - [757, 0.0] - - - [5120, 320, 1, 2880] - - [758, 0.0] - - - [5120, 448, 1, 2880] - - [759, 0.0] - - - [5120, 576, 1, 2880] - - [760, 0.0] - - - [5120, 720, 1, 2880] - - [761, 0.0] - - - [5120, 848, 1, 2880] - - [762, 0.0] - - - [5120, 944, 1, 2880] - - [763, 0.0] - - - [5120, 7169, 1, 2880] - - [764, 0.0] - - - [5120, 8192, 1, 2880] - - [765, 0.0] - - - [25136, 12, 1, 2880] - - [766, 0.0] - - - [25136, 22, 1, 2880] - - [767, 0.0] - - - [25136, 38, 1, 2880] - - [768, 0.0] - - - [25136, 57, 1, 2880] - - [769, 0.0] - - - [25136, 69, 1, 2880] - - [770, 0.0] - - - [201088, 9, 1, 2880] - - [771, 0.0] - - - [201088, 24, 1, 2880] - - [772, 0.0] - - - [201088, 42, 1, 2880] - - [773, 0.0] - - - [201088, 72, 1, 2880] - - [774, 0.0] - - - [20, 2048, 1, 3840] - - [775, 0.0] - - - [20, 10240, 1, 3840] - - [776, 0.0] - - - [20, 18432, 1, 3840] - - [777, 0.0] - - - [20, 34816, 1, 3840] - - [778, 0.0] - - - [32, 2048, 1, 2048] - - [779, 0.0] - - - [32, 2048, 1, 4096] - - [780, 0.0] - - - [32, 4096, 1, 1024] - - [781, 0.0] - - - [32, 4096, 1, 1536] - - [782, 0.0] - - - [32, 4096, 1, 4096] - - [783, 0.0] - - - [32, 6144, 1, 768] - - [784, 0.0] - - - [64, 40960, 1, 2048] - - [785, 0.0] - - - [256, 1536, 1, 2048] - - [786, 0.0] - - - [304, 12288, 1, 512] - - [787, 0.0] - - - [512, 12288, 1, 304] - - [788, 0.0] - - - [512, 12288, 1, 512] - - [789, 0.0] - - - [512, 12288, 1, 2048] - - [790, 0.0] - - - [512, 32768, 1, 4096] - - [791, 0.0] - - - [576, 16, 1, 576] - - [792, 0.0] - - - [576, 16, 1, 2304] - - [793, 0.0] - - - [576, 264, 1, 576] - - [794, 0.0] - - - [576, 264, 1, 2304] - - [795, 0.0] - - - [576, 2048, 1, 576] - - [796, 0.0] - - - [576, 2048, 1, 2304] - - [797, 0.0] - - - [576, 2048, 1, 3840] - - [798, 0.0] - - - [576, 12288, 1, 576] - - [799, 0.0] - - - [576, 12288, 1, 3840] - - [800, 0.0] - - - [1024, 8, 1, 512] - - [801, 0.0] - - - [1024, 8, 1, 1024] - - [802, 0.0] - - - [1152, 2048, 1, 576] - - [803, 0.0] - - - [1440, 1440, 1, 384] - - [804, 0.0] - - - [1728, 264, 1, 576] - - [805, 0.0] - - - [1728, 2048, 1, 576] - - [806, 0.0] - - - [1728, 12288, 1, 576] - - [807, 0.0] - - - [2048, 2056, 1, 2048] - - [808, 0.0] - - - [2048, 2056, 1, 3840] - - [809, 0.0] - - - [2048, 2056, 1, 8192] - - [810, 0.0] - - - [2048, 12288, 1, 512] - - [811, 0.0] - - - [2048, 12288, 1, 1536] - - [812, 0.0] - - - [2304, 16, 1, 576] - - [813, 0.0] - - - [2304, 264, 1, 576] - - [814, 0.0] - - - [2304, 2048, 1, 576] - - [815, 0.0] - - - [2816, 30720, 1, 2048] - - [816, 0.0] - - - [3840, 256, 1, 3840] - - [817, 0.0] - - - [3840, 512, 1, 3840] - - [818, 0.0] - - - [3840, 768, 1, 3840] - - [819, 0.0] - - - [3840, 1024, 1, 3840] - - [820, 0.0] - - - [3840, 1280, 1, 3840] - - [821, 0.0] - - - [3840, 1536, 1, 3840] - - [822, 0.0] - - - [3840, 1792, 1, 3840] - - [823, 0.0] - - - [3840, 2048, 1, 576] - - [824, 0.0] - - - [3840, 2048, 1, 3840] - - [825, 0.0] - - - [3840, 2056, 1, 2048] - - [826, 0.0] - - - [3840, 2560, 1, 3840] - - [827, 0.0] - - - [3840, 2816, 1, 3840] - - [828, 0.0] - - - [3840, 5632, 1, 3840] - - [829, 0.0] - - - [4096, 512, 1, 10240] - - [830, 0.0] - - - [5120, 512, 1, 5120] - - [1190, 0.0] - - - [5120, 520, 1, 5120] - - [831, 0.0] - - - [5120, 5760, 1, 64] - - [832, 0.0] - - - [5120, 11520, 1, 64] - - [833, 0.0] - - - [6240, 6240, 1, 384] - - [834, 0.0] - - - [8192, 2056, 1, 2048] - - [835, 0.0] - - - [100096, 40960, 1, 2048] - - [836, 0.0] - - - [1792, 2896, 1, 512] - - [837, 0.0] - - - [2, 6577472, 1, 9] - - [838, 0.0] - - - [2048, 1280, 1, 68032] - - [954, 0.0] - - - [256, 1280, 1, 384] - - [839, 0.0] - - - [256, 66960, 1, 1024] - - [840, 0.0] - - - [256, 114560, 1, 512] - - [841, 0.0] - - - [256, 136080, 1, 1920] - - [842, 0.0] - - - [256, 194160, 1, 1024] - - [843, 0.0] - - - [384, 1280, 1, 384] - - [844, 0.0] - - - [120, 1024, 1, 96] - - [845, 0.0] - - - [384, 196608, 1, 648] - - [846, 0.0] - - - [98304, 1024, 1, 9600] - - [847, 0.0] - - - [128, 3072, 1, 24] - - [848, 0.0] - - - [160, 3072, 1, 512] - - [849, 0.0] - - - [160, 3072, 1, 888] - - [1259, 0.0] - - - [256, 3072, 1, 24] - - [850, 0.0] - - - [316, 3072, 1, 256] - - [851, 0.0] - - - [320, 3072, 1, 24] - - [852, 0.0] - - - [512, 3072, 1, 32] - - [853, 0.0] - - - [512, 3072, 1, 64] - - [854, 0.0] - - - [512, 3072, 1, 96] - - [855, 0.0] - - - [512, 3072, 1, 128] - - [856, 0.0] - - - [512, 3072, 1, 192] - - [857, 0.0] - - - [512, 3072, 1, 1880] - - [858, 0.0] - - - [2048, 3072, 1, 4480] - - [859, 0.0] - - - [3840, 3072, 1, 2048] - - [860, 0.0] - - - [4480, 3072, 1, 768] - - [861, 0.0] - - - [32, 2048, 1, 64] - - [862, 0.0] - - - [32, 2048, 1, 128] - - [863, 0.0] - - - [64, 2048, 1, 192] - - [864, 0.0] - - - [128, 192, 1, 128] - - [865, 0.0] - - - [128, 2048, 1, 9984] - - [1280, 0.0] - - - [128, 93614, 1, 256] - - [1281, 0.0] - - - [128, 155468, 1, 128] - - [866, 0.0] - - - [128, 184146, 1, 128] - - [1282, 0.0] - - - [192, 192, 1, 192] - - [867, 0.0] - - - [192, 2048, 1, 512] - - [868, 0.0] - - - [192, 88177, 1, 832] - - [1283, 0.0] - - - [192, 110122, 1, 640] - - [869, 0.0] - - - [192, 207816, 1, 256] - - [870, 0.0] - - - [192, 225878, 1, 192] - - [871, 0.0] - - - [192, 262144, 1, 448] - - [872, 0.0] - - - [192, 385620, 1, 384] - - [873, 0.0] - - - [192, 391699, 1, 192] - - [1287, 0.0] - - - [192, 393216, 1, 448] - - [874, 0.0] - - - [192, 524288, 1, 448] - - [1288, 0.0] - - - [192, 559568, 1, 192] - - [1289, 0.0] - - - [192, 690621, 1, 192] - - [1290, 0.0] - - - [256, 192, 1, 256] - - [875, 0.0] - - - [256, 114688, 1, 128] - - [1292, 0.0] - - - [256, 131072, 1, 256] - - [876, 0.0] - - - [256, 262144, 1, 512] - - [877, 0.0] - - - [256, 384937, 1, 832] - - [1295, 0.0] - - - [256, 428780, 1, 448] - - [878, 0.0] - - - [256, 870382, 1, 256] - - [879, 0.0] - - - [256, 874159, 1, 256] - - [880, 0.0] - - - [384, 114688, 1, 128] - - [1296, 0.0] - - - [504, 262144, 1, 512] - - [881, 0.0] - - - [504, 262144, 1, 1414] - - [882, 0.0] - - - [512, 2048, 1, 512] - - [883, 0.0] - - - [512, 2048, 1, 1024] - - [884, 0.0] - - - [512, 2048, 1, 7168] - - [1297, 0.0] - - - [512, 114688, 1, 128] - - [885, 0.0] - - - [512, 262144, 1, 512] - - [886, 0.0] - - - [1024, 2048, 1, 2048] - - [887, 0.0] - - - [1024, 2048, 1, 7168] - - [1301, 0.0] - - - [4096, 4096, 1, 36864] - - [888, 0.0] - - - [9984, 2048, 1, 1024] - - [889, 0.0] - - - [15964, 2048, 1, 256] - - [1307, 0.0] - - - [16384, 2048, 1, 256] - - [890, 0.0] - - - [32768, 2048, 1, 9984] - - [891, 0.0] - - - [131456, 2048, 1, 1024] - - [892, 0.0] - - - [6, 6, 2048, 512] - - [893, 0.0] - - - [6, 6, 2048, 1024] - - [894, 0.0] - - - [4096, 4096, 1, 65728] - - [895, 0.0] - - - [6144, 1024, 1, 6144] - - [896, 0.0] - - - [24576, 1024, 1, 6144] - - [897, 0.0] - - - [6144, 1024, 1, 24576] - - [898, 0.0] - - - [1357, 1024, 48, 128] - - [899, 0.0] - - - [1024, 1024, 48, 128] - - [900, 0.0] - - - [256, 256, 64, 64] - - [901, 0.0] - - - [1024, 1024, 6, 64] - - [902, 0.0] - - - [6144, 65536, 1, 1024] - - [903, 0.0] - - - [6144, 65536, 1, 8] - - [904, 0.0] - - - [64, 2048, 1, 3072] - - [905, 0.0] - - - [512, 65536, 1, 512] - - [906, 0.0] - - - [768, 77, 1, 768] - - [907, 0.0] - - - [768, 77, 1, 3072] - - [908, 0.0] - - - [3072, 32, 1, 3072] - - [909, 0.0] - - - [3072, 32, 1, 4096] - - [910, 0.0] - - - [3072, 32, 1, 12288] - - [911, 0.0] - - - [3072, 77, 1, 768] - - [912, 0.0] - - - [3072, 2048, 1, 64] - - [913, 0.0] - - - [3072, 2048, 1, 3072] - - [914, 0.0] - - - [3072, 2048, 1, 12288] - - [915, 0.0] - - - [3072, 2080, 1, 3072] - - [916, 0.0] - - - [3072, 2080, 1, 15360] - - [917, 0.0] - - - [4096, 256, 1, 4096] - - [918, 0.0] - - - [4096, 256, 1, 10240] - - [919, 0.0] - - - [10240, 256, 1, 4096] - - [920, 0.0] - - - [12288, 32, 1, 3072] - - [921, 0.0] - - - [12288, 2048, 1, 3072] - - [922, 0.0] - - - [12288, 2080, 1, 3072] - - [923, 0.0] - - - [3072, 1, 1, 256] - - [924, 0.0] - - - [3072, 1, 1, 768] - - [925, 0.0] - - - [3072, 1, 1, 3072] - - [926, 0.0] - - - [6144, 1, 1, 3072] - - [927, 0.0] - - - [9216, 1, 1, 3072] - - [928, 0.0] - - - [18432, 1, 1, 3072] - - [929, 0.0] - - - [3840, 2304, 1, 3840] - - [930, 0.0] - - - [128, 18928, 1, 128] - - [1203, 0.0] - - - [128, 32768, 1, 128] - - [931, 0.0] - - - [128, 2119936, 1, 128] - - [932, 0.0] - - - [128, 3670016, 1, 128] - - [933, 0.0] - - - [134, 16800000, 1, 128] - - [934, 0.0] - - - [512, 18928, 1, 128] - - [935, 0.0] - - - [512, 32768, 1, 128] - - [936, 0.0] - - - [1024, 150000, 1, 1024] - - [937, 0.0] - - - [2048, 150000, 1, 1134] - - [938, 0.0] - - - [4096, 150000, 1, 1024] - - [939, 0.0] - - - [4096, 150000, 1, 2268] - - [940, 0.0] - - - [1280, 257, 1, 1280] - - [1132, 0.0] - - - [4096, 512, 1, 4096] - - [1188, 0.0] - - - [5120, 257, 1, 1280] - - [1134, 0.0] - - - [5120, 257, 1, 5120] - - [1135, 0.0] - - - [5120, 512, 1, 4096] - - [1189, 0.0] - - - [5120, 13640, 1, 5120] - - [955, 0.0] - - - [5120, 13640, 1, 13824] - - [956, 0.0] - - - [10240, 512, 1, 4096] - - [1200, 0.0] - - - [512, 512, 64, 64] - - [957, 0.0] - - - [1024, 128, 1, 4096] - - [958, 0.0] - - - [1024, 256, 1, 7168] - - [959, 0.0] - - - [1024, 512, 1, 7168] - - [960, 0.0] - - - [1024, 576, 1, 7168] - - [961, 0.0] - - - [1024, 1024, 1, 7168] - - [962, 0.0] - - - [1024, 1280, 1, 8192] - - [963, 0.0] - - - [1024, 1536, 1, 7168] - - [964, 0.0] - - - [1024, 2304, 1, 4096] - - [965, 0.0] - - - [1024, 2304, 1, 16384] - - [966, 0.0] - - - [1024, 2560, 1, 5120] - - [967, 0.0] - - - [1024, 2560, 1, 8192] - - [968, 0.0] - - - [1024, 3072, 1, 1536] - - [969, 0.0] - - - [1024, 4096, 1, 512] - - [970, 0.0] - - - [1024, 4096, 1, 1024] - - [971, 0.0] - - - [1024, 4096, 1, 2048] - - [972, 0.0] - - - [1024, 4096, 1, 7168] - - [973, 0.0] - - - [1024, 4096, 1, 8192] - - [974, 0.0] - - - [1024, 4608, 1, 7168] - - [975, 0.0] - - - [1024, 4608, 1, 16384] - - [976, 0.0] - - - [1024, 5120, 1, 1024] - - [977, 0.0] - - - [1024, 5120, 1, 2048] - - [978, 0.0] - - - [1024, 5120, 1, 3200] - - [979, 0.0] - - - [1024, 5120, 1, 6400] - - [980, 0.0] - - - [1024, 5120, 1, 8192] - - [981, 0.0] - - - [1024, 5120, 1, 25600] - - [982, 0.0] - - - [1024, 6144, 1, 1536] - - [983, 0.0] - - - [1024, 6400, 1, 5120] - - [984, 0.0] - - - [1024, 7168, 1, 256] - - [985, 0.0] - - - [1024, 7168, 1, 512] - - [986, 0.0] - - - [1024, 7168, 1, 2048] - - [987, 0.0] - - - [1024, 7168, 1, 2304] - - [988, 0.0] - - - [1024, 7168, 1, 4096] - - [989, 0.0] - - - [1024, 7168, 1, 4608] - - [990, 0.0] - - - [1024, 7168, 1, 8192] - - [991, 0.0] - - - [1024, 7168, 1, 16384] - - [992, 0.0] - - - [1024, 7168, 1, 18432] - - [993, 0.0] - - - [1024, 8192, 1, 512] - - [994, 0.0] - - - [1024, 8192, 1, 1024] - - [995, 0.0] - - - [1024, 8192, 1, 2048] - - [996, 0.0] - - - [1024, 8192, 1, 3584] - - [997, 0.0] - - - [1024, 8192, 1, 7168] - - [998, 0.0] - - - [1024, 8192, 1, 8192] - - [999, 0.0] - - - [1024, 8192, 1, 28672] - - [1000, 0.0] - - - [1024, 9216, 1, 4096] - - [1001, 0.0] - - - [1024, 9216, 1, 7168] - - [1002, 0.0] - - - [1024, 10240, 1, 5120] - - [1003, 0.0] - - - [1024, 10240, 1, 8192] - - [1004, 0.0] - - - [1024, 12800, 1, 5120] - - [1005, 0.0] - - - [1024, 13312, 1, 16384] - - [1006, 0.0] - - - [1024, 14336, 1, 8192] - - [1007, 0.0] - - - [1024, 16384, 1, 4096] - - [1008, 0.0] - - - [1024, 16384, 1, 6656] - - [1009, 0.0] - - - [1024, 16384, 1, 13312] - - [1010, 0.0] - - - [1024, 16384, 1, 16384] - - [1011, 0.0] - - - [1024, 16384, 1, 53248] - - [1012, 0.0] - - - [1024, 18432, 1, 16384] - - [1013, 0.0] - - - [1024, 24576, 1, 1536] - - [1014, 0.0] - - - [1024, 26624, 1, 16384] - - [1015, 0.0] - - - [1024, 36864, 1, 7168] - - [1016, 0.0] - - - [1024, 51200, 1, 5120] - - [1017, 0.0] - - - [1024, 57344, 1, 8192] - - [1018, 0.0] - - - [1024, 106496, 1, 16384] - - [1019, 0.0] - - - [2048, 256, 1, 7168] - - [1020, 0.0] - - - [2048, 512, 1, 7168] - - [1021, 0.0] - - - [2048, 576, 1, 7168] - - [1022, 0.0] - - - [2048, 1024, 1, 7168] - - [1023, 0.0] - - - [2048, 1152, 1, 4096] - - [1024, 0.0] - - - [2048, 1280, 1, 5120] - - [1025, 0.0] - - - [2048, 1280, 1, 8192] - - [1026, 0.0] - - - [2048, 1536, 1, 7168] - - [1027, 0.0] - - - [2048, 2304, 1, 4096] - - [1028, 0.0] - - - [2048, 2304, 1, 16384] - - [1029, 0.0] - - - [2048, 2560, 1, 5120] - - [1030, 0.0] - - - [2048, 2560, 1, 8192] - - [1031, 0.0] - - - [2048, 4096, 1, 512] - - [1032, 0.0] - - - [2048, 4096, 1, 1024] - - [1033, 0.0] - - - [2048, 4096, 1, 2048] - - [1034, 0.0] - - - [2048, 4096, 1, 7168] - - [1035, 0.0] - - - [2048, 4096, 1, 8192] - - [1036, 0.0] - - - [2048, 4608, 1, 7168] - - [1037, 0.0] - - - [2048, 4608, 1, 16384] - - [1038, 0.0] - - - [2048, 5120, 1, 1024] - - [1039, 0.0] - - - [2048, 5120, 1, 2048] - - [1040, 0.0] - - - [2048, 5120, 1, 3200] - - [1041, 0.0] - - - [2048, 5120, 1, 6400] - - [1042, 0.0] - - - [2048, 5120, 1, 8192] - - [1043, 0.0] - - - [2048, 5120, 1, 25600] - - [1044, 0.0] - - - [2048, 6144, 1, 1536] - - [1045, 0.0] - - - [2048, 6400, 1, 5120] - - [1046, 0.0] - - - [2048, 7168, 1, 256] - - [1047, 0.0] - - - [2048, 7168, 1, 512] - - [1048, 0.0] - - - [2048, 7168, 1, 2048] - - [1049, 0.0] - - - [2048, 7168, 1, 8192] - - [1050, 0.0] - - - [2048, 7168, 1, 16384] - - [1051, 0.0] - - - [2048, 7168, 1, 18432] - - [1052, 0.0] - - - [2048, 8192, 1, 3584] - - [1053, 0.0] - - - [2048, 8192, 1, 7168] - - [1054, 0.0] - - - [2048, 8192, 1, 8192] - - [1055, 0.0] - - - [2048, 8192, 1, 28672] - - [1056, 0.0] - - - [2048, 9216, 1, 4096] - - [1057, 0.0] - - - [2048, 9216, 1, 7168] - - [1058, 0.0] - - - [2048, 10240, 1, 5120] - - [1059, 0.0] - - - [2048, 10240, 1, 8192] - - [1060, 0.0] - - - [2048, 12800, 1, 5120] - - [1061, 0.0] - - - [2048, 13312, 1, 16384] - - [1062, 0.0] - - - [2048, 14336, 1, 8192] - - [1063, 0.0] - - - [2048, 16384, 1, 2048] - - [1064, 0.0] - - - [2048, 16384, 1, 4096] - - [1065, 0.0] - - - [2048, 16384, 1, 6656] - - [1066, 0.0] - - - [2048, 16384, 1, 13312] - - [1067, 0.0] - - - [2048, 16384, 1, 16384] - - [1068, 0.0] - - - [2048, 16384, 1, 53248] - - [1069, 0.0] - - - [2048, 18432, 1, 16384] - - [1070, 0.0] - - - [2048, 24576, 1, 1536] - - [1071, 0.0] - - - [2048, 26624, 1, 16384] - - [1072, 0.0] - - - [2048, 36864, 1, 7168] - - [1073, 0.0] - - - [2048, 51200, 1, 5120] - - [1074, 0.0] - - - [2048, 106496, 1, 16384] - - [1075, 0.0] - - - [4096, 128, 1, 4096] - - [1076, 0.0] - - - [4096, 256, 1, 7168] - - [1077, 0.0] - - - [4096, 512, 1, 7168] - - [1078, 0.0] - - - [4096, 576, 1, 7168] - - [1079, 0.0] - - - [4096, 1024, 1, 7168] - - [1080, 0.0] - - - [4096, 1152, 1, 4096] - - [1081, 0.0] - - - [4096, 1280, 1, 5120] - - [1082, 0.0] - - - [4096, 1280, 1, 8192] - - [1083, 0.0] - - - [4096, 1536, 1, 7168] - - [1084, 0.0] - - - [4096, 2304, 1, 4096] - - [1085, 0.0] - - - [4096, 2304, 1, 16384] - - [1086, 0.0] - - - [4096, 2560, 1, 8192] - - [1087, 0.0] - - - [4096, 3072, 1, 1536] - - [1088, 0.0] - - - [4096, 4096, 1, 1024] - - [1089, 0.0] - - - [4096, 4096, 1, 7168] - - [1090, 0.0] - - - [4096, 4096, 1, 8192] - - [1091, 0.0] - - - [4096, 4608, 1, 7168] - - [1092, 0.0] - - - [4096, 4608, 1, 16384] - - [1093, 0.0] - - - [4096, 5120, 1, 1024] - - [1094, 0.0] - - - [4096, 5120, 1, 2048] - - [1095, 0.0] - - - [4096, 5120, 1, 3200] - - [1096, 0.0] - - - [4096, 5120, 1, 6400] - - [1097, 0.0] - - - [4096, 5120, 1, 8192] - - [1098, 0.0] - - - [4096, 5120, 1, 25600] - - [1099, 0.0] - - - [4096, 6144, 1, 1536] - - [1100, 0.0] - - - [4096, 6400, 1, 5120] - - [1101, 0.0] - - - [4096, 7168, 1, 512] - - [1102, 0.0] - - - [4096, 7168, 1, 2048] - - [1103, 0.0] - - - [4096, 7168, 1, 2304] - - [1104, 0.0] - - - [4096, 7168, 1, 4096] - - [1105, 0.0] - - - [4096, 7168, 1, 4608] - - [1106, 0.0] - - - [4096, 7168, 1, 8192] - - [1107, 0.0] - - - [4096, 7168, 1, 16384] - - [1108, 0.0] - - - [4096, 7168, 1, 18432] - - [1109, 0.0] - - - [4096, 8192, 1, 3584] - - [1110, 0.0] - - - [4096, 8192, 1, 7168] - - [1111, 0.0] - - - [4096, 8192, 1, 8192] - - [1112, 0.0] - - - [4096, 8192, 1, 28672] - - [1113, 0.0] - - - [4096, 9216, 1, 4096] - - [1114, 0.0] - - - [4096, 9216, 1, 7168] - - [1115, 0.0] - - - [4096, 10240, 1, 5120] - - [1116, 0.0] - - - [4096, 10240, 1, 8192] - - [1117, 0.0] - - - [4096, 12800, 1, 5120] - - [1118, 0.0] - - - [4096, 13312, 1, 16384] - - [1119, 0.0] - - - [4096, 14336, 1, 8192] - - [1120, 0.0] - - - [4096, 16384, 1, 2048] - - [1121, 0.0] - - - [4096, 16384, 1, 4096] - - [1122, 0.0] - - - [4096, 16384, 1, 6656] - - [1123, 0.0] - - - [4096, 16384, 1, 16384] - - [1124, 0.0] - - - [4096, 16384, 1, 53248] - - [1125, 0.0] - - - [4096, 18432, 1, 16384] - - [1126, 0.0] - - - [4096, 24576, 1, 1536] - - [1127, 0.0] - - - [4096, 26624, 1, 16384] - - [1128, 0.0] - - - [4096, 51200, 1, 5120] - - [1129, 0.0] - - - [4096, 106496, 1, 16384] - - [1130, 0.0] - - - [8192, 8192, 1, 37888] - - [1131, 0.0] - - - [1280, 264, 1, 1280] - - [1133, 0.0] - - - [5120, 264, 1, 1280] - - [1136, 0.0] - - - [5120, 264, 1, 5120] - - [1137, 0.0] - - - [5120, 9419, 1, 5120] - - [1138, 0.0] - - - [5120, 9419, 1, 13824] - - [1139, 0.0] - - - [5120, 9424, 1, 13824] - - [1140, 0.0] - - - [5120, 18389, 1, 5120] - - [1141, 0.0] - - - [5120, 18392, 1, 5120] - - [1142, 0.0] - - - [5120, 21090, 1, 5120] - - [1195, 0.0] - - - [5120, 21090, 1, 13824] - - [1196, 0.0] - - - [5120, 21096, 1, 5120] - - [1143, 0.0] - - - [5120, 21096, 1, 13824] - - [1144, 0.0] - - - [13824, 9419, 1, 5120] - - [1145, 0.0] - - - [13824, 9424, 1, 5120] - - [1146, 0.0] - - - [13824, 18389, 1, 5120] - - [1147, 0.0] - - - [13824, 18392, 1, 5120] - - [1148, 0.0] - - - [13824, 21090, 1, 5120] - - [1149, 0.0] - - - [13824, 21096, 1, 5120] - - [1150, 0.0] - - - [3072, 256, 1, 3072] - - [1151, 0.0] - - - [3072, 17, 1, 3072] - - [1152, 0.0] - - - [3072, 33677, 1, 15360] - - [1153, 0.0] - - - [3072, 33660, 1, 12288] - - [1154, 0.0] - - - [12288, 33677, 1, 3072] - - [1155, 0.0] - - - [12288, 33660, 1, 3072] - - [1156, 0.0] - - - [3072, 14867, 1, 3072] - - [1157, 0.0] - - - [3072, 14867, 1, 15360] - - [1158, 0.0] - - - [12288, 14867, 1, 3072] - - [1159, 0.0] - - - [3072, 14850, 1, 12288] - - [1160, 0.0] - - - [3072, 33660, 1, 3072] - - [1161, 0.0] - - - [64, 33660, 1, 3072] - - [1162, 0.0] - - - [3072, 33677, 1, 3072] - - [1163, 0.0] - - - [3072, 33664, 1, 12288] - - [1157, 0.0] - - - [3072, 33680, 1, 15360] - - [1158, 0.0] - - - [12288, 33680, 1, 3072] - - [1164, 0.0] - - - [12288, 33664, 1, 3072] - - [1165, 0.0] - - - [64, 14850, 1, 3072] - - [1166, 0.0] - - - [3072, 14856, 1, 12288] - - [1167, 0.0] - - - [12288, 14856, 1, 3072] - - [1168, 0.0] - - - [3072, 14872, 1, 15360] - - [1169, 0.0] - - - [12288, 14872, 1, 3072] - - [1170, 0.0] - - - [3072, 14850, 1, 3072] - - [1171, 0.0] - - - [16032, 128, 1, 16384] - - [1172, 0.0] - - - [16032, 64, 1, 16384] - - [1173, 0.0] - - - [16032, 16, 1, 16384] - - [1174, 0.0] - - - [16032, 8, 1, 16384] - - [1175, 0.0] - - - [16032, 32, 1, 16384] - - [1176, 0.0] - - - [128256, 8, 1, 8192] - - [1177, 0.0] - - - [128256, 16, 1, 8192] - - [1178, 0.0] - - - [128256, 64, 1, 8192] - - [1179, 0.0] - - - [128256, 128, 1, 8192] - - [1180, 0.0] - - - [128256, 32, 1, 8192] - - [1181, 0.0] - - - [128256, 124, 1, 8192] - - [1182, 0.0] - - - [128256, 127, 1, 8192] - - [1183, 0.0] - - - [128256, 125, 1, 8192] - - [1184, 0.0] - - - [128256, 126, 1, 8192] - - [1185, 0.0] - - - [128256, 123, 1, 8192] - - [1186, 0.0] - - - [128256, 122, 1, 8192] - - [1187, 0.0] - - - [5120, 4106, 1, 5120] - - [1191, 0.0] - - - [5120, 4106, 1, 13824] - - [1192, 0.0] - - - [5120, 4200, 1, 5120] - - [1193, 0.0] - - - [5120, 4200, 1, 13824] - - [1194, 0.0] - - - [5120, 21263, 1, 5120] - - [1197, 0.0] - - - [5120, 21263, 1, 13824] - - [1198, 0.0] - - - [5120, 21264, 1, 13824] - - [1199, 0.0] - - - [13824, 4106, 1, 5120] - - [1201, 0.0] - - - [13824, 4200, 1, 5120] - - [1202, 0.0] - - - [128, 18928, 1, 512] - - [1204, 0.0] - - - [1024, 150000, 1, 4096] - - [1205, 0.0] - - - [1134, 150000, 1, 2048] - - [1206, 0.0] - - - [2048, 150000, 1, 8192] - - [1207, 0.0] - - - [2268, 150000, 1, 4096] - - [1208, 0.0] - - - [4096, 150000, 1, 2048] - - [1209, 0.0] - - - [128, 32768, 1, 512] - - [1210, 0.0] - - - [128, 16800000, 1, 134] - - [1211, 0.0] - - - [576, 7680, 1, 576] - - [1212, 0.0] - - - [128, 14000000, 1, 134] - - [1213, 0.0] - - - [512, 7680, 1, 304] - - [1214, 0.0] - - - [512, 7680, 1, 512] - - [1215, 0.0] - - - [512, 7680, 1, 2048] - - [1216, 0.0] - - - [512, 26696, 1, 128] - - [1217, 0.0] - - - [576, 10, 1, 576] - - [1218, 0.0] - - - [576, 10, 1, 2304] - - [1219, 0.0] - - - [576, 1280, 1, 2304] - - [1220, 0.0] - - - [576, 1280, 1, 3840] - - [1221, 0.0] - - - [576, 7680, 1, 3840] - - [1222, 0.0] - - - [1024, 5, 1, 512] - - [1223, 0.0] - - - [1024, 5, 1, 1024] - - [1224, 0.0] - - - [1024, 125000, 1, 128] - - [1225, 0.0] - - - [20, 21760, 1, 3840] - - [1226, 0.0] - - - [1024, 125000, 1, 1024] - - [1227, 0.0] - - - [1152, 1280, 1, 576] - - [1228, 0.0] - - - [1728, 165, 1, 576] - - [1229, 0.0] - - - [1728, 1280, 1, 576] - - [1230, 0.0] - - - [1728, 7680, 1, 576] - - [1231, 0.0] - - - [2048, 1285, 1, 2048] - - [1232, 0.0] - - - [2048, 1285, 1, 3840] - - [1233, 0.0] - - - [2048, 1285, 1, 8192] - - [1234, 0.0] - - - [2048, 125000, 1, 8192] - - [1235, 0.0] - - - [2268, 125000, 1, 4096] - - [1236, 0.0] - - - [2304, 1280, 1, 576] - - [1237, 0.0] - - - [128, 26696, 1, 512] - - [1238, 0.0] - - - [3840, 1280, 1, 576] - - [1239, 0.0] - - - [3840, 1285, 1, 2048] - - [1240, 0.0] - - - [3840, 6400, 1, 3840] - - [1241, 0.0] - - - [3840, 7680, 1, 576] - - [1242, 0.0] - - - [3840, 11520, 1, 3840] - - [1243, 0.0] - - - [4096, 125000, 1, 1024] - - [1244, 0.0] - - - [4096, 125000, 1, 2048] - - [1245, 0.0] - - - [7680, 1280, 1, 3840] - - [1246, 0.0] - - - [96, 96, 10, 96] - - [1247, 0.0] - - - [128, 2989952, 1, 128] - - [1248, 0.0] - - - [16032, 5, 1, 16384] - - [1249, 0.0] - - - [32, 286720, 1, 257] - - [1250, 0.0] - - - [57, 491520, 1, 1219] - - [1252, 0.0] - - - [256, 196608, 1, 641] - - [1264, 0.0] - - - [256, 245760, 1, 641] - - [1265, 0.0] - - - [32, 2048, 1, 9984] - - [1279, 0.0] - - - [192, 170757, 1, 192] - - [1284, 0.0] - - - [192, 342315, 1, 448] - - [1285, 0.0] - - - [192, 359839, 1, 256] - - [1286, 0.0] - - - [256, 2048, 1, 15964] - - [1291, 0.0] - - - [256, 131072, 1, 128] - - [1293, 0.0] - - - [256, 131072, 1, 192] - - [1294, 0.0] - - - [512, 2048, 1, 9984] - - [1298, 0.0] - - - [512, 2048, 1, 10752] - - [1299, 0.0] - - - [768, 2048, 1, 15964] - - [1300, 0.0] - - - [1024, 2048, 1, 9984] - - [1302, 0.0] - - - [1024, 2048, 1, 10752] - - [1303, 0.0] - - - [1024, 2048, 1, 14336] - - [1304, 0.0] - - - [1024, 2048, 1, 32768] - - [1305, 0.0] - - - [1920, 2048, 1, 15964] - - [1306, 0.0] - - - [4096, 3840, 1, 8192] - - [1308, 0.0] - - - [4096, 3328, 1, 8192] - - [1309, 427.47] - - - [3584, 4096, 1, 8192] - - [1310, 0.0] - - - [3072, 4096, 1, 8192] - - [1311, 0.0] -- null -- null -- DeviceEfficiency -- Equality + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x10M96ahVYtg9nz7wVE0_jilGKPCC2Hn7G5j6 \ No newline at end of file From 5a79662bcc67a1e25927476c54c142841088ebc4 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Fri, 21 Nov 2025 02:13:54 +0000 Subject: [PATCH 10/18] typo --- ...k_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml | 173754 ++++++++++++++- 1 file changed, 173753 insertions(+), 1 deletion(-) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml index e6c4c3ae58d..112955c78e2 100755 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml @@ -135970,4 +135970,173756 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x10M96ahVYtg9nz7wVE0_jilGKPCC2Hn7G5j6 \ No newline at end of file + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x10M96ahVYtg9nz7wVE0_jilGKPCC2Hn7G5j6hk7n8Cq4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC2_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29440 + LdsInitCVgprs: false + LdsNumBytes: 29440 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 2 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 585 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC2_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x1aItiZpnpRjcYIurxfBmFGJqO0Ddnngzyn750XF40uwQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB3_NTC0_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29824 + LdsInitCVgprs: false + LdsNumBytes: 29824 + LdsNumElementsAlignedA: 4224 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4224 + LdsOffsetB_Blk: 20608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4224 + LdsOffsetMetadata_Blk: 20608 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 586 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB3_NTC0_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16xI2u-Ror5FqkQ3j_XB1vG0HfqV0MUB2v1g9i5PeE9dK0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB4_NTC0_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 62336 + LdsInitCVgprs: false + LdsNumBytes: 62336 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 12672 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 587 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB4_NTC0_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x64_MI16x1tIELiWthd6KUjMdtTQSxbaPibBS-rnYxP9egrRr9Qkg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB0_NTC2_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 12480 + LdsInitCVgprs: false + LdsNumBytes: 12480 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2176 + LdsOffsetMetadata_Blk: 10368 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 0 + NonTemporalC: 2 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 588 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB0_NTC2_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x64_MI16x1xcpM29sdXgq9Obmz4PsmmNsNPdSFmNPCzaXJ7HLy9wE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25216 + LdsInitCVgprs: false + LdsNumBytes: 25216 + LdsNumElementsAlignedA: 4224 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4224 + LdsOffsetB_Blk: 20608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4224 + LdsOffsetMetadata_Blk: 20608 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 589 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x1bAwHyQ3GvY3vKl9IMh8l8so9YqF-GnfsyYCXe2IpfsU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB3_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29824 + LdsInitCVgprs: false + LdsNumBytes: 29824 + LdsNumElementsAlignedA: 4224 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4224 + LdsOffsetB_Blk: 20608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4224 + LdsOffsetMetadata_Blk: 20608 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 590 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB3_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x64_MI16x1EUpWGBFC9M8r9mtOXEn2kt2875FYVpKPvAYu5RNAIJM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB2_NTC3_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29440 + LdsInitCVgprs: false + LdsNumBytes: 29440 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 24832 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 24832 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 3 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 591 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB2_NTC3_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x64_MI16x1JtNQNtsnA1Sv8b8yh4PSwgVqK7nqobXjvnB5dDn1KZ0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC5_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 5 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 592 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC5_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT80x64x64_MI16x1jvoQqF8xs6wPfM77e_0lXt27oInqZdCHYYvS2jjTZjs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT5_1_MO40_NTn1_NTA1_NTB2_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53504 + LdsInitCVgprs: false + LdsNumBytes: 53504 + LdsNumElementsAlignedA: 11520 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 11520 + LdsOffsetB_Blk: 44288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 11520 + LdsOffsetMetadata_Blk: 44288 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 1] + MIWaveTileA: 5 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 80 + MacroTile1: 64 + MacroTileA: 80 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 10 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 593 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT5_1_MO40_NTn1_NTA1_NTB2_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 1 + ThreadTileA: 20 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x64_MI16x12bQCSUgEYzHwc-PD0joW94qw2tOVZsB4YM94lzol2GQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54656 + LdsInitCVgprs: false + LdsNumBytes: 54656 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 12672 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 594 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16xH274nfviM8iTEC9EqVMp7KNIKr5Tu4fbGuMgZChQLwI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59392 + LdsInitCVgprs: false + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 595 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 7 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x64_MI16xeitizeynoDBPdmSwRBF5Xg6yeYRaHhmb2AuHnk-Jjxw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB1_NTC4_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59904 + LdsInitCVgprs: false + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 4 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 596 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB1_NTC4_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM32_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT160x128x64_MI16xIyCic_6SCaQHmRo4gZpospLo38ih23M3n0i8gSNlkM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB5_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 103552 + LdsInitCVgprs: false + LdsNumBytes: 103552 + LdsNumElementsAlignedA: 21120 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 21120 + LdsOffsetB_Blk: 86656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 21120 + LdsOffsetMetadata_Blk: 86656 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 5 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 597 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB5_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xf6FAbndEq795AH2jLpLXyL_zsKaM-nx7XEfVRnU4xGg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB2_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49664 + LdsInitCVgprs: false + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 2 + NonTemporalC: 3 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 598 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB2_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x32x512_MI16x9RYSCMS1YOFrGPF7jBc1TMhLBlf1ljMGKxQVtEqS2Lg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB2_NTC1_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 115712 + LdsInitCVgprs: false + LdsNumBytes: 115712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 599 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB2_NTC1_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x256_MI16xSF36trc75_ee7cIk05feZIYCwFPm6GAFiS2-I-5FAbA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 3 + NonTemporalC: 2 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 600 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x128_MI16xtDurztqqli3v-jsi4_HIWyYY_P3H0AZYLDd26dnVkks= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA7_NTB1_NTC1_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 110592 + LdsInitCVgprs: false + LdsNumBytes: 110592 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 601 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA7_NTB1_NTC1_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT48x64x256_MI16xkAW-i3iRPLi_hWhZCB7bemxTlrcNdInt5o4XHUQLbtM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB0_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 64 + MacroTileA: 48 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 602 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA4_NTB0_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x80x256_MI16xs81qgGS4sNKWxhufdlT4wTK1vEJhN5Ffxrcx9SpGsKM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x80x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_5_MO40_NTn1_NTA4_NTB1_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 43520 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 113152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 113152 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 5] + MIWaveTileA: 1 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 80 + MacroTileA: 64 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 603 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x80x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_5_MO40_NTn1_NTA4_NTB1_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 5 + ThreadTileA: 4 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x96x128_MI16xrLeoctTCB1bACoQ--beNqEzb0YI8LOUhpdHAYkRgIvY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA7_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55296 + LdsInitCVgprs: false + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 604 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA7_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x64x128_MI16xfNGTqyHzTAgsGJT4k9BPcwtHxtde0XJty9p1gmf49y8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA4_NTB2_NTC2_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 110080 + LdsInitCVgprs: false + LdsNumBytes: 110080 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 1] + MIWaveTileA: 6 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 2 + NonTemporalC: 2 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 605 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA4_NTB2_NTC2_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 1 + ThreadTileA: 24 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT48x64x256_MI16xTlqyDmTkj4a9CbMyFh6-oJ7vLAZn4NKSD09VWqfaX6s= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB2_NTC0_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 64 + MacroTileA: 48 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 606 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB2_NTC0_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x256_MI16xLmmGlj_Aj8A6fIMmFhO1dylq0PzddWmPhFmOjV8mNYs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA3_NTB3_NTC7_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 136192 + LdsInitCVgprs: false + LdsNumBytes: 136192 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 68096 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 607 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA3_NTB3_NTC7_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x96x128_MI16xTLhvHuGGTrNNoRNX0qiCIT3prWYKTLd7nnYcbxeKw70= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA5_NTB0_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120832 + LdsInitCVgprs: false + LdsNumBytes: 120832 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 608 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA5_NTB0_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x128x128_MI16v7CFS67dToWu7E36jBzM3hnxFhhWEF1cSNV18qHrtEk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_2_MO40_NTn1_NTA1_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 609 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_2_MO40_NTn1_NTA1_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x128x128_MI16ZWO7HpjPNmu6UA2V1hsB04cMUqxmkdqiKg8wJvcIyHk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA7_NTB2_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 610 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA7_NTB2_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI19h3rQP9aLCc3mGFwqilyX_IB29uLEdD2KWe4y57nhKw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB1_NTC3_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 1 + NonTemporalC: 3 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 611 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB1_NTC3_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI1CxJiV14JTrK3H1ybea3EPczFFJRckm5EFanYUzn3ORk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB1_NTC7_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 1 + NonTemporalC: 7 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 612 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB1_NTC7_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x96x128_MI16xHJIMpCJEpG_CXiSzPpjlBMrZYiH1WeXOfw9riY9KTWc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA3_NTB2_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120832 + LdsInitCVgprs: false + LdsNumBytes: 120832 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 613 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA3_NTB2_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x80x128_MI16ngeiZGZG7mL9cfPgvCQlNFzzUu9wWiYYYs_RgDPtw9s= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 614 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x80x128_MI1673d8kHswsq18xA7tmJKkurm3XT4Q-HnKYoAezcsdn54= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA2_NTB1_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 615 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA2_NTB1_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI32csEMSPPsusU5YMb_wufq8F6P-8Sxs2EgAC0PPFEv6gk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA2_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 26112 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 616 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA2_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM16_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x128x128_MI1689RN_I3TvC1Gm0Z2xealtErSeUtX5KtCYijpRpZlPVk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 617 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x128x128_MI16AzS5NoxFk2Au_HbPsvaytavfJD8hghd_sJGfMrTtqqc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA3_NTB2_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 618 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA3_NTB2_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x64_MI16AKi47eQ72s767fgFKjHxBMB61xROY48hH_ZC9Dg-uL4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA2_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 56832 + LdsInitCVgprs: false + LdsNumBytes: 56832 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 30720 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 56832 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 3] + MIWaveTileA: 12 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 619 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA2_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 3 + ThreadTileA: 48 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x224x64_MI16vPxjaC7jAP1WCZxG5tdxVnhuIV2nrtGe7eEx5M_A97k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA2_NTB2_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63488 + LdsInitCVgprs: false + LdsNumBytes: 63488 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 63488 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 7] + MIWaveTileA: 6 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 224 + MacroTileA: 192 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 168 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 6 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 620 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA2_NTB2_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 7 + ThreadTileA: 24 + ThreadTileB: 7 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16UfrTfwRtCk7X5b0oDzl9lO69mf5F9FKhG38KQAizz0I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC6_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 125440 + LdsInitCVgprs: false + LdsNumBytes: 125440 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 4] + MIWaveTileA: 12 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 24 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 24 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 621 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC6_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM2_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x48x64_MI16x1zwucDcbYqoLc5jj3KVOwMSgVTbe-k-vlwhLtkP43A7w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA4_NTB1_NTC0_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 31168 + LdsInitCVgprs: false + LdsNumBytes: 31168 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 6400 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 24832 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 24832 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 48 + MacroTileA: 64 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 6 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 622 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA4_NTB1_NTC0_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x64_MI16xhFp48AWAx0WFxNJS_zwhp3LH75FnBeuIGQ-PHfRGV8M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA7_NTB3_NTC1_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59904 + LdsInitCVgprs: false + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 623 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA7_NTB3_NTC1_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x192x64_MI16xDc85R6FBHvP3Db6U89LaQsZiTgNI0YXqfD8EZrc9sRY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99584 + LdsInitCVgprs: false + LdsNumBytes: 99584 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 3 + NonTemporalC: 2 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 624 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA4_NTB3_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 6 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16vPoTMjKt8nWTFpbxSId3wmEKYDEB48u2cWczx9X8ORY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA6_NTB3_NTC2_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 3 + NonTemporalC: 2 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 625 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA6_NTB3_NTC2_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16BOWImfSEsD_3G-o1iXGR5QlXGk-JQslWP21I5MyceTU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA7_NTB0_NTC4_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 110592 + LdsInitCVgprs: false + LdsNumBytes: 110592 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 626 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA7_NTB0_NTC4_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x160x64_MI16xWi346OG8MPM52jqLnWZLZZZ3iPSXNJNwpzIVgL5wxlU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA4_NTB2_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65024 + LdsInitCVgprs: false + LdsNumBytes: 65024 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 160 + MacroTileA: 64 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 20 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 20 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 627 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA4_NTB2_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16PnbeKpDjqDFQuRGaKM1Bxep0ty_w-huZ0GCiXdgUOH0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA2_NTB3_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 628 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA2_NTB3_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16dZECwM1-ZUT8NrMGVw0isp2_dkCorYlpeB97j2fIEDQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA2_NTB3_NTC6_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 629 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA2_NTB3_NTC6_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 6 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16jYuVIUs7OMwAdUxYUsQi4ZYOBPt9XeBE6EvrTgfVOXM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA4_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104064 + LdsInitCVgprs: false + LdsNumBytes: 104064 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 16 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 630 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA4_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT160x128x64_MI160ChZLQK1-RbLvH-HVpy6jN_NmDI3jJlXjB9H7W24k_M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_2_MO40_NTn1_NTA2_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 105472 + LdsInitCVgprs: false + LdsNumBytes: 105472 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 88576 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 2] + MIWaveTileA: 10 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 20 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 20 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 631 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_2_MO40_NTn1_NTA2_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 2 + ThreadTileA: 40 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16nbKJcGCHBDcxSonpixiDJ1Xzs9zasEp0fEz4y57YzZM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB3_NTC4_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 4 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 632 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB3_NTC4_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16Qb0kRw4kC1BW5Axl2fZWWgBa72Xc93g5inENYkl8kQ4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 633 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT112x16x64_MI16xSPOJU0ZSldRFeT5QIftb2Rv0LMrYsuCg4tJSJO98EbA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT112x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA4_NTB3_NTC1_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49728 + LdsInitCVgprs: false + LdsNumBytes: 49728 + LdsNumElementsAlignedA: 14848 + LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 14848 + LdsOffsetB_Blk: 47616 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 14848 + LdsOffsetMetadata_Blk: 47616 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [7, 1] + MIWaveTileA: 7 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 112 + MacroTile1: 16 + MacroTileA: 112 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 14 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 634 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT112x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA4_NTB3_NTC1_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 1 + ThreadTileA: 28 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT112x32x64_MI16xhCkqxVO4S_u07UIACIbdXayZs6cLW0nwjm56at5G-TY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT112x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA3_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51840 + LdsInitCVgprs: false + LdsNumBytes: 51840 + LdsNumElementsAlignedA: 14848 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 14848 + LdsOffsetB_Blk: 47616 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 14848 + LdsOffsetMetadata_Blk: 47616 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [7, 1] + MIWaveTileA: 7 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 112 + MacroTile1: 32 + MacroTileA: 112 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 7 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 635 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT112x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA3_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 1 + ThreadTileA: 28 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x48x64_MI16x9yvEFw2CY7fogdvm7chr1dqncsv-JhOXUYRyF96wk04= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA2_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58112 + LdsInitCVgprs: false + LdsNumBytes: 58112 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 6912 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 51200 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 48 + MacroTileA: 128 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 16 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 636 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA2_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16xlATdcDJmHjTh2fb3_R4sw8cZpm4iDPSIziZbeZj6_2E= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 0 + NonTemporalC: 3 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 637 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x80x64_MI16xuf_mQQYY6e_xseewo_eCdO8-75Ww2keRq6k-6Kte8TQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_5_MO40_NTn1_NTA7_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61184 + LdsInitCVgprs: false + LdsNumBytes: 61184 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 11520 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 638 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_5_MO40_NTn1_NTA7_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT448x16x64_MI16xphmWFUPXbOcidjCYWA4-uWMzUY_LXmzcCzaU91ZAfzg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x16x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA4_NTB2_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 74240 + LdsInitCVgprs: false + LdsNumBytes: 74240 + LdsNumElementsAlignedA: 71680 + LdsNumElementsAlignedB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 71680 + LdsOffsetB_Blk: 202752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 74240 + LdsOffsetMetadata_Blk: 202752 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [7, 1] + MIWaveTileA: 7 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 16 + MacroTileA: 448 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 14 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 639 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x16x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA4_NTB2_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 1 + ThreadTileA: 28 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT448x32x64_MI16xyGJzXCsePrx3w8oPN_5-Z4ms4U5FQdG0p8cPMrUh6p0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_1_MO40_NTn1_NTA5_NTB1_NTC3_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 64512 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 64512 + LdsOffsetB_Blk: 134144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 64512 + LdsOffsetMetadata_Blk: 134144 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 1] + MIWaveTileA: 14 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 32 + MacroTileA: 448 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 1 + NonTemporalC: 3 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 14 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 640 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_1_MO40_NTn1_NTA5_NTB1_NTC3_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 1 + ThreadTileA: 56 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT224x64x64_MI16xAem7wjfwMpeIdy5IePcpzvkIPguZG_-48eS9Utsv2B0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT14_1_MO40_NTn1_NTA6_NTB0_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42496 + LdsInitCVgprs: false + LdsNumBytes: 42496 + LdsNumElementsAlignedA: 32256 + LdsNumElementsAlignedB: 10240 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32256 + LdsOffsetB_Blk: 97792 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42496 + LdsOffsetMetadata_Blk: 97792 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [14, 1] + MIWaveTileA: 14 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 64 + MacroTileA: 224 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 7 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 641 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT14_1_MO40_NTn1_NTA6_NTB0_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 1 + ThreadTileA: 56 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT448x96x64_MI16xwVNXPPje9A9Oux2i66AN2WtWvGx_LcqCLKCNdupasEs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_3_MO40_NTn1_NTA7_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 159744 + LdsInitCVgprs: false + LdsNumBytes: 159744 + LdsNumElementsAlignedA: 64512 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 79872 + LdsOffsetB: 64512 + LdsOffsetB_Blk: 144384 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 64512 + LdsOffsetMetadata_Blk: 144384 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 3] + MIWaveTileA: 14 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 96 + MacroTileA: 448 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 168 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 14 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 642 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_3_MO40_NTn1_NTA7_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 3 + ThreadTileA: 56 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x64_MI16x1Zxj7hmIhFf0e6sHLkyALBfq6r73zPsZ-XR3BoqHDSgk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB1_NTC1_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 12480 + LdsInitCVgprs: false + LdsNumBytes: 12480 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2176 + LdsOffsetMetadata_Blk: 10368 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 643 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB1_NTC1_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x64_MI16x1XQlhZiKbwBTnsXPxp1q4o2irOq2zfp2Y65C9IZ3YoMQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 12480 + LdsInitCVgprs: false + LdsNumBytes: 12480 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2176 + LdsOffsetMetadata_Blk: 10368 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 644 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x32x64_MI16x1RDT_SAbG1Q4m8tjAod_ZUY3haGM8M2J7D6id1FgbeLE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 14592 + LdsInitCVgprs: false + LdsNumBytes: 14592 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2176 + LdsOffsetMetadata_Blk: 10368 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 645 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 7 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x48x64_MI16x1MdzKbimZAKsiTS822naX-Cj47npLSbYpMQ1E6wBqIR4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x48x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA3_NTB3_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24896 + LdsInitCVgprs: false + LdsNumBytes: 24896 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 6400 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 18560 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2176 + LdsOffsetMetadata_Blk: 18560 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 48 + MacroTileA: 16 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 2 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 646 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x48x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA3_NTB3_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x64x64_MI16x1ET1ujsY8nygIYm_fI--IZvnOtQX-vgf9FL_uuSis_dM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 27008 + LdsInitCVgprs: false + LdsNumBytes: 27008 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 18560 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2176 + LdsOffsetMetadata_Blk: 18560 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 3 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 647 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 7 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x64_MI16x1ZK4lPjLxPtESxVESaBb0bjZvzm3PXcRqVZ9QNTtsFmU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB5_NTC1_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25216 + LdsInitCVgprs: false + LdsNumBytes: 25216 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 5 + NonTemporalC: 1 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 648 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB5_NTC1_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x1ryhvjQy9Rt7q0NXyJWBm18l0Me0RTEddt73F0RdSnWg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29440 + LdsInitCVgprs: false + LdsNumBytes: 29440 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 649 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16xWDZ6BX4xpSToEnwcbyhgbIcvdSHipUU7aRUJWaWZoAQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB7_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 62336 + LdsInitCVgprs: false + LdsNumBytes: 62336 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 12672 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 7 + NonTemporalC: 1 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 650 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB7_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x32x64_MI16x1Z0vg-bZ_gL9MhYmVUEa6Ri_Qp759nhBFrg1Cxoodnjg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB2_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 14592 + LdsInitCVgprs: false + LdsNumBytes: 14592 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2176 + LdsOffsetMetadata_Blk: 10368 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 651 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB2_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x1rPu3O7LZ76vL2u7y5rr3EWOiCmLkbwq5NK6qXHp6gKc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB2_NTC2_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29824 + LdsInitCVgprs: false + LdsNumBytes: 29824 + LdsNumElementsAlignedA: 4224 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4224 + LdsOffsetB_Blk: 20608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4224 + LdsOffsetMetadata_Blk: 20608 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 2 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 652 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB2_NTC2_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 6 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x1tDT_hrw3fHD8_o3Zp1b1tGE2weQyHC32kJu7wlmKPxc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA2_NTB0_NTC2_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29440 + LdsInitCVgprs: false + LdsNumBytes: 29440 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 2 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 653 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA2_NTB0_NTC2_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x96x64_MI16x1-HBv_7E9Iz2YhgL1w1MXVfJP3ZE20choBF-i3eydDf8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA0_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 13824 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 37376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 37376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 654 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA0_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x64_MI16x1B4FrTyt9kfGdKb4c_hmAfgg7GdQSww8Qzv_iE0c3gi0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA1_NTB3_NTC3_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49664 + LdsInitCVgprs: false + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 3 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 655 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA1_NTB3_NTC3_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x64_MI16x1JEVCxGr47CIY-HI8qxrZmKgrc1wM-_JZuDpX1WhY7IM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA1_NTB2_NTC4_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 4 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 656 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA1_NTB2_NTC4_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x64_MI16x1P7fU9OFzqfhEM9o3ZnBDL2oiSqY7E--X2JskGaL9FoA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24832 + LdsInitCVgprs: false + LdsNumBytes: 24832 + LdsNumElementsAlignedA: 4224 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4224 + LdsOffsetB_Blk: 20608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4224 + LdsOffsetMetadata_Blk: 20608 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 3 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 657 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 6 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x64_MI16x1BLbdM9dbMt3NUAhJq-FfugXosmhxsqmK_BkcQ5OAhOg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54656 + LdsInitCVgprs: false + LdsNumBytes: 54656 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 12672 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 658 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16xbKVSGGFgDw5i8McKOQa-A53vhnlFKW7c72QLmlhQisQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB3_NTC4_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 4 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 659 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB3_NTC4_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16xOJCUhlFECuCTwzXpkC6jiu2Ixkoz_9HWuHIxaAxXqh8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 660 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16x8Lu9f2yKHuTv8Iy85nfNk3O3KnHLpzbK3KxUw1Mbok4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 62848 + LdsInitCVgprs: false + LdsNumBytes: 62848 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 12672 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 16 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 661 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x512_MI16xXRxh_faCRRltjL034xoMmk_TiKa3xdJCa74hUHvvHEU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 0 + NonTemporalC: 2 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 662 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x48x512_MI16xRWoVyAvU8jwb3j6CRTvndH2SL3yYuGMQRd2NzD81sLI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x48x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA6_NTB1_NTC0_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 50688 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 84480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 84480 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 48 + MacroTileA: 16 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 3 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 663 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x48x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA6_NTB1_NTC0_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x16x256_MI16xCiygDODDK4Fa2btyaNoWwcgWWpgaEN3yAsezRpzjY5c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 107776 + LdsInitCVgprs: false + LdsNumBytes: 107776 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 664 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x256_MI16xiervKaX2K61CRRejq1AG3_Z65vVaHOJ0L6-2dHWApwY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC0_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 665 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC0_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x256_MI16x7_taCQeo5Fw91ch_k5selL-SvvOT-Li1hQ45Oq47or0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB1_NTC3_NTD7_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 1 + NonTemporalC: 3 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 666 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB1_NTC3_NTD7_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x16x128_MI16xspEMKZ93Myckc_GOKIntGOAz2o43UgKBk4YuGWRg8fk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC2_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55808 + LdsInitCVgprs: false + LdsNumBytes: 55808 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 51200 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 2 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 667 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC2_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x128_MI16xi-yvAevSwW9s0hapSLs7FgKWw6ot3l97yFKd4adNR2U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB0_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 668 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB0_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x128_MI16xOfKmRAiyYbD1ongki4w-aOH6B_Asv5QGII7Zid4nDtU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB1_NTC2_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 2 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 669 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB1_NTC2_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x128_MI16xIj575u2gUPbRKt9h6HW8Brd0cnlHlMiSmmqDVlgfL7I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB3_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 670 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB3_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x128_MI16xZ49CBLf5ScwOBmtxfyDIZSZP9YOFJqvvP643E4q-8r8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA3_NTB1_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59392 + LdsInitCVgprs: false + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 671 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA3_NTB1_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI169Pt6LbH02NexWojOA7qAB2-Fm2qPQ2e49zKWSkq3aQ4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB1_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 672 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB1_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI16cwc8HxsQarpjYLPbWETQe3uyARxl1l74SNVlAyjgLDQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA1_NTB1_NTC5_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 673 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA1_NTB1_NTC5_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x128_MI16VUU44mm8BMcnxwEtHA89E_6df0UR4aTy2W8vQlkVcbA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 7 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 674 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x128_MI16ksL5sGakGBodVE2S4KzleS-f8oU1KqielXgO3mZSIQw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117248 + LdsInitCVgprs: false + LdsNumBytes: 117248 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 4 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 675 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x64_MI16x1MjIFVZajzfoFkPKUGd62NeM2UNclTA5fyapd57LKLMA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA1_NTB2_NTC5_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49664 + LdsInitCVgprs: false + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 676 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA1_NTB2_NTC5_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM32_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x160x128_MI16NeMdRGaNIunm3_27NSGWaZZR4mRdMr8hAhKtRPCxJTM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x160x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 129024 + LdsInitCVgprs: false + LdsNumBytes: 129024 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 160 + MacroTileA: 64 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 677 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x160x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x192x128_MI16nXFspfQ1n-JzXHNxfWztMu23CLya1Pfo9pLNlbt2Q7Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA3_NTB2_NTC4_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 144384 + LdsInitCVgprs: false + LdsNumBytes: 144384 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 55296 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 72192 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 89088 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 89088 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 4 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 678 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA3_NTB2_NTC4_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x80x128_MI16tAd_lkigyTdEYRjzMIK5_8l6RnyYhnBdukQFBQ-gD0M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA1_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 679 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA1_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x64_MI16x1Dfy4Xf_-nNJ9qTF6JJW4iktgBz4PP6t0_lQo70vu8Pw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA2_NTB3_NTC0_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57344 + LdsInitCVgprs: false + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 680 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA2_NTB3_NTC0_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x128x128_MI16TAya8fP-9detORb1PiFXCjXxw-JyWSf1Yy8FSpejObs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_2_MO40_NTn1_NTA3_NTB3_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 681 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_2_MO40_NTn1_NTA3_NTB3_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x96x64_MI16xpq67mkJqOOTRU5Z-xeSs2N2SBCRuU-O8RBVF0fAkxgo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA3_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 112000 + LdsInitCVgprs: false + LdsNumBytes: 112000 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 12672 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 682 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA3_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x32_MI16F5-Y3heVCa_launRUn-KSz1wHCDBn8ZAWYAzxkNo_44= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63488 + LdsInitCVgprs: false + LdsNumBytes: 63488 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 15360 + LdsOffsetMetadata_Blk: 48128 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 683 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS64_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 64 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x128x64_MI16gWrmBtg2_i8eryiIALzrySUiqdyXxbV8TJwUnamyhPQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_2_MO40_NTn1_NTA2_NTB2_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 44544 + LdsInitCVgprs: false + LdsNumBytes: 44544 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 2] + MIWaveTileA: 12 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 7 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 684 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_2_MO40_NTn1_NTA2_NTB2_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16EzTqgiXgPBF_ARPg_cK-JoXMOyRPW6J3lOFjOMJa4Ho= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA2_NTB2_NTC1_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 125440 + LdsInitCVgprs: false + LdsNumBytes: 125440 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 4] + MIWaveTileA: 12 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 24 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 24 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 685 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA2_NTB2_NTC1_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x64_MI16x18L2jmuT-hMIzEZWLTh1LIKclLsLpudAVc-YTJZ2phKQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 2 + NonTemporalC: 2 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 686 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x64_MI16xDCe3l8na3f27VLtnVlzFGVavj96L_Azsll4KulPHqsI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA6_NTB0_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59904 + LdsInitCVgprs: false + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 687 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA6_NTB0_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16xVIw57mRIRYH0m_CcIZDK-p-YdVEXWxLXeDaRPF09Dbw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA6_NTB1_NTC0_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 62336 + LdsInitCVgprs: false + LdsNumBytes: 62336 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 12672 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 688 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA6_NTB1_NTC0_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16z31fl2N-0jv2niJNtBtZg1a7wQy6gEnUqHfWWGXiT3E= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA6_NTB2_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 689 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA6_NTB2_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16c4B1pd6abobRt0oLrz-drDK66b4TY25EF2_dOH_x7U4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA7_NTB0_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 110592 + LdsInitCVgprs: false + LdsNumBytes: 110592 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 690 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA7_NTB0_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x192x64_MI16xwNdA3XAVZgNp7B97A8M72_NWG1tfTcIrxSB0QVt12A0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA4_NTB3_NTC0_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99584 + LdsInitCVgprs: false + LdsNumBytes: 99584 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 691 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA4_NTB3_NTC0_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16tGuzVf8otnHrXADhFHX3DGYqRygBCmdCoE5Fl-ZtZ1Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA6_NTB3_NTC1_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99840 + LdsInitCVgprs: false + LdsNumBytes: 99840 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 692 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA6_NTB3_NTC1_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16cvElwXhZzoI5bHsRvbpqVI_TQ8sdHbKbNSqGN-UZTOg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 0 + NonTemporalC: 3 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 693 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16eECx5F9F_BhjbKQ29yTAgPVBEIVPLCxsVGWNpAn05IU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA4_NTB0_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104064 + LdsInitCVgprs: false + LdsNumBytes: 104064 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 16 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 694 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA4_NTB0_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16QXtD7oXSMnmIF8hDGFn7uvC5RdO-XbtSlTg2Daq-GCE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA6_NTB2_NTC6_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 107776 + LdsInitCVgprs: false + LdsNumBytes: 107776 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 695 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA6_NTB2_NTC6_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16kGGVlLQYO67WzbEjuBWcMuovcPe3dUezUjRu8kulS10= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB1_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 3] + MIWaveTileA: 16 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 696 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB1_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x16x64_MI16xzAdJSKxtc6G2i4oH_Hj-ERRE93AFOiKixcHMN4uLcXQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA6_NTB1_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51968 + LdsInitCVgprs: false + LdsNumBytes: 51968 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 697 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA6_NTB1_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x16x64_MI16x_nXPOwLHN3tUCPcweu7IEYYExrIq38iv1_FscmoR-Aw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_1_MO40_NTn1_NTA4_NTB2_NTC2_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51776 + LdsInitCVgprs: false + LdsNumBytes: 51776 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [8, 1] + MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 2 + NonTemporalC: 2 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 698 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_1_MO40_NTn1_NTA4_NTB2_NTC2_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT112x32x64_MI16xKLw0NFHWmTaLgGANNLe0WN9LcfTlWk51L14snqqNok8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT112x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA1_NTB2_NTC0_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51840 + LdsInitCVgprs: false + LdsNumBytes: 51840 + LdsNumElementsAlignedA: 14848 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 14848 + LdsOffsetB_Blk: 47616 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 14848 + LdsOffsetMetadata_Blk: 47616 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [7, 1] + MIWaveTileA: 7 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 112 + MacroTile1: 32 + MacroTileA: 112 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 7 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 699 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT112x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA1_NTB2_NTC0_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 1 + ThreadTileA: 28 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x48x64_MI16x9wGPz7-Mii8l-IhgHa7zhP1w2g2Ym4tR2F7tq6DEi0Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA4_NTB2_NTC0_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 56576 + LdsInitCVgprs: false + LdsNumBytes: 56576 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 6912 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 48 + MacroTileA: 128 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 6 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 700 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA4_NTB2_NTC0_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16xPabkvQBq6PTiaDnOaVG-y_ngipkRJNKgbjS24MBfNqk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA4_NTB3_NTC0_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 1] + MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 701 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA4_NTB3_NTC0_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT112x128x64_MI16TfLTsnYrpucGsLz8gM1xsamGYVXiPxeFu05ySX1YaOk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT112x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT7_2_MO40_NTn1_NTA4_NTB3_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 98560 + LdsInitCVgprs: false + LdsNumBytes: 98560 + LdsNumElementsAlignedA: 16128 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16128 + LdsOffsetB_Blk: 81664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16128 + LdsOffsetMetadata_Blk: 81664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [7, 2] + MIWaveTileA: 7 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 112 + MacroTile1: 128 + MacroTileA: 112 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 14 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 702 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT112x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT7_2_MO40_NTn1_NTA4_NTB3_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 2 + ThreadTileA: 28 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT448x16x64_MI16x9oEas0UOpGUOmhyse3IXS7SIr_ndKahc3Jx4Suz2H-U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x16x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA7_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 74240 + LdsInitCVgprs: false + LdsNumBytes: 74240 + LdsNumElementsAlignedA: 71680 + LdsNumElementsAlignedB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 71680 + LdsOffsetB_Blk: 202752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 74240 + LdsOffsetMetadata_Blk: 202752 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [7, 1] + MIWaveTileA: 7 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 16 + MacroTileA: 448 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 14 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 703 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x16x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA7_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 1 + ThreadTileA: 28 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT448x64x64_MI16xeVBJlxCeUfN_ihjeytHEHpBd2522A2rZt89DOT-bqvw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x64x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_2_MO40_NTn1_NTA7_NTB0_NTC6_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 73728 + LdsInitCVgprs: false + LdsNumBytes: 73728 + LdsNumElementsAlignedA: 64512 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 64512 + LdsOffsetB_Blk: 195584 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 73728 + LdsOffsetMetadata_Blk: 195584 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 2] + MIWaveTileA: 14 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 64 + MacroTileA: 448 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 14 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 704 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x64x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_2_MO40_NTn1_NTA7_NTB0_NTC6_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 2 + ThreadTileA: 56 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT160x64x64_MI16xO93OcQCctQNjvd4HoNyaVKrJHjm_AEXCRD9SlC29sEY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x64x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_1_MO40_NTn1_NTA5_NTB2_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33280 + LdsInitCVgprs: false + LdsNumBytes: 33280 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 10240 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 88576 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 1] + MIWaveTileA: 10 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 64 + MacroTileA: 160 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 5 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 705 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x64x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_1_MO40_NTn1_NTA5_NTB2_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 1 + ThreadTileA: 40 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT448x128x64_MI16qXhzyG-FakFAbib9NlL1w8ROnfbqFiOW7XA57Hqai5w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_4_MO40_NTn1_NTA6_NTB0_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 163840 + LdsInitCVgprs: false + LdsNumBytes: 163840 + LdsNumElementsAlignedA: 64512 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 81920 + LdsOffsetB: 64512 + LdsOffsetB_Blk: 146432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 64512 + LdsOffsetMetadata_Blk: 146432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [14, 4] + MIWaveTileA: 14 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 128 + MacroTileA: 448 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 112 + NumLoadsA: 14 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 706 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_4_MO40_NTn1_NTA6_NTB0_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 4 + ThreadTileA: 56 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x64_MI16x1NCytB9k0ij3z4uf6B317BIYNTx4tr48y9CngTD1U4ZI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB3_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 12480 + LdsInitCVgprs: false + LdsNumBytes: 12480 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2176 + LdsOffsetMetadata_Blk: 10368 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 2 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 707 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB3_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x64_MI16x1YtPZCOLi-DnSw5IVU7dP5eBr01NxaSBDvkq89RKa8pA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC0_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 12480 + LdsInitCVgprs: false + LdsNumBytes: 12480 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2176 + LdsOffsetMetadata_Blk: 10368 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 708 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC0_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 6 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x64_MI16x1GId-X8cKHc-l_ELt7OdAoaqSR_hvch9JTa1nGl6jiY0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC3_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 12480 + LdsInitCVgprs: false + LdsNumBytes: 12480 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2176 + LdsOffsetMetadata_Blk: 10368 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 3 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 709 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC3_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x32x64_MI16x1Cd41uzHX9EztoGVefiFEAAyJdVVCF8AvQy7SMCZ8NNE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 14592 + LdsInitCVgprs: false + LdsNumBytes: 14592 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2176 + LdsOffsetMetadata_Blk: 10368 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 3 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 710 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 7 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x32x64_MI16x10DytydjFaSLdZouDuqkxSLEH6VzuTGNc9jervTdQBOU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 14592 + LdsInitCVgprs: false + LdsNumBytes: 14592 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2176 + LdsOffsetMetadata_Blk: 10368 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 711 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x64x64_MI16x1tdrQ8qpLoUomTcqwbKmBEEvcHHPcszC0bIHRtUtSkcs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_4_MO40_NTn1_NTA0_NTB3_NTC0_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 27008 + LdsInitCVgprs: false + LdsNumBytes: 27008 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 18560 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2176 + LdsOffsetMetadata_Blk: 18560 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 712 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_4_MO40_NTn1_NTA0_NTB3_NTC0_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x1UPuQ8Zwz0zy3F2ssNB85ESh41zQ_fTaxnDqDRps9bzg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA1_NTB2_NTC1_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29440 + LdsInitCVgprs: false + LdsNumBytes: 29440 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 713 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA1_NTB2_NTC1_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x12jV8nCEXICeDeNZR3rpMOdU2WL28KVKkvtHvBJEjsoE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29824 + LdsInitCVgprs: false + LdsNumBytes: 29824 + LdsNumElementsAlignedA: 4224 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4224 + LdsOffsetB_Blk: 20608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4224 + LdsOffsetMetadata_Blk: 20608 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 714 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x1c6s2CEIFL5M-Q5ue-1R60z2fLeGtlkMFej_fdZmZYUg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC2_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29824 + LdsInitCVgprs: false + LdsNumBytes: 29824 + LdsNumElementsAlignedA: 4224 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4224 + LdsOffsetB_Blk: 20608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4224 + LdsOffsetMetadata_Blk: 20608 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 2 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 715 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC2_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x1R3wMxpmrTzbHyUNQAHs0qpo4qAauWtxgQvYZS-wtR2M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29824 + LdsInitCVgprs: false + LdsNumBytes: 29824 + LdsNumElementsAlignedA: 4224 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4224 + LdsOffsetB_Blk: 20608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4224 + LdsOffsetMetadata_Blk: 20608 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 716 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC4_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x64_MI16x1cURYFfuow7g066ypzbjOK3jjY2fuD-5KEaDvgcYCHyc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB6_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29440 + LdsInitCVgprs: false + LdsNumBytes: 29440 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 24832 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 24832 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 6 + NonTemporalC: 1 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 717 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB6_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 6 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16xlcYR8juFWCw_Ix8760EPVoEwO3Roqo9_rr2p9Xkv5ek= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA0_NTB4_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 62336 + LdsInitCVgprs: false + LdsNumBytes: 62336 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 12672 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 3 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 718 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA0_NTB4_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 6 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x32x64_MI16x1R5V35wpj69NpM5mzUXf0Zhcai3NKdhVE_Jz7gSQ7xFY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB2_NTC0_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 14592 + LdsInitCVgprs: false + LdsNumBytes: 14592 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2176 + LdsOffsetMetadata_Blk: 10368 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 719 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB2_NTC0_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x64x64_MI16x1SHcKpbeMDyXBrCXa7M1d0hY-5V3YQTXNV2k962zKu2Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB1_NTC2_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 27008 + LdsInitCVgprs: false + LdsNumBytes: 27008 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 18560 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2176 + LdsOffsetMetadata_Blk: 18560 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 1 + NonTemporalC: 2 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 720 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB1_NTC2_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x64_MI16x1omuc73oyBR7U3NzjgQXGNljyGZxlh7dEqHfHebLJ2X0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB2_NTC3_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25216 + LdsInitCVgprs: false + LdsNumBytes: 25216 + LdsNumElementsAlignedA: 4224 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4224 + LdsOffsetB_Blk: 20608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4224 + LdsOffsetMetadata_Blk: 20608 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 2 + NonTemporalC: 3 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 721 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB2_NTC3_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x1dgun9e__Gzmtlud0Xf36rbsFBLUR_xE9XpjsbVjo4AY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA6_NTB3_NTC3_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29440 + LdsInitCVgprs: false + LdsNumBytes: 29440 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 3 + NonTemporalC: 3 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 722 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA6_NTB3_NTC3_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x64_MI16x1B_Lyvp5dWeKixwi2gHOu9zLZ6JRmHuHAmmRBFR30n8g= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB3_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29440 + LdsInitCVgprs: false + LdsNumBytes: 29440 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 2 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 723 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB3_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x64_MI16x1i3QEJiVmGb5gjfuyEykQWutr2YwUiiX3ahAHDecMx8E= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA3_NTB2_NTC7_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 7 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 724 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA3_NTB2_NTC7_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x64_MI16x1nUve6Doosp5bmngRKmeuvc6llO1F-P1ilbrDBwTYSdU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24832 + LdsInitCVgprs: false + LdsNumBytes: 24832 + LdsNumElementsAlignedA: 4224 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4224 + LdsOffsetB_Blk: 20608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4224 + LdsOffsetMetadata_Blk: 20608 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 725 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x64_MI16x1BdCFJziQpw7DeYX3m0AwcEnWlr9uI3hLVhfPV4G8lLI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA1_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54656 + LdsInitCVgprs: false + LdsNumBytes: 54656 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 12672 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 726 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA1_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x64_MI16xOMPy7NClxqyBEQUKpVREQHDGJFJDf9KHH_WDlph19Ls= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB2_NTC7_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59904 + LdsInitCVgprs: false + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 7 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 727 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB2_NTC7_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x64_MI16x-gnAx6fuSUAg2DEBKpW-O5EpMF2gXA5frMYyk-me2l4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB2_NTC4_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59904 + LdsInitCVgprs: false + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 4 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 728 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB2_NTC4_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16qkbgejEY4-sN3GnmO2JLIUuX_8eYRonn7mqNgAy1jrI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA2_NTB1_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 729 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA2_NTB1_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT160x128x64_MI16sbD4OYYxqA_VKyhrGIGV6QLEWjZ22THuaZamVyDZ1rk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_2_MO40_NTn1_NTA2_NTB4_NTC2_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 107008 + LdsInitCVgprs: false + LdsNumBytes: 107008 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 88576 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 2] + MIWaveTileA: 10 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 4 + NonTemporalC: 2 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 5 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 730 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_2_MO40_NTn1_NTA2_NTB4_NTC2_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 2 + ThreadTileA: 40 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x32x256_MI16xX2LWdHMBiiJK0leRqPVs9QFN-C7DoG2d2I1SMPhzufs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC3_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58112 + LdsInitCVgprs: false + LdsNumBytes: 58112 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 1 + NonTemporalC: 3 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 731 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC3_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x256_MI16x-0x9A1X6-o7F-dO-YNjYWFeZ88aJFZOUzdk6u-NNMP4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 2 + NonTemporalC: 2 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 732 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x256_MI16xlKsy40ZEiJZd43bJ8yzOCJAsvlRp8LbLilH7qcMbK-s= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC2_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 2 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 733 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC2_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x256_MI16xpthvBqJYAMQu7bVeBH4gsv2lMS_aiy2Ljzd-hbl7AIc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 2 + NonTemporalC: 2 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 734 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB2_NTC2_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x256_MI16xz342g_PxsMKOAbNGNIaWZR0PuLKRdlhWo1bnpC_WjuI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB1_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 7 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 735 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB1_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x48x128_MI16x4-M9nFXM8QfT1da5UDhI3BmFjNd00BGpMdvm3u5_lyk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA1_NTB0_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65024 + LdsInitCVgprs: false + LdsNumBytes: 65024 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 13824 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 51200 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 48 + MacroTileA: 64 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 736 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA1_NTB0_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x128_MI16xwvgqld36rRgCy8XCu2RU_zvid-zmePF_xh9lAZ7loO8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA3_NTB3_NTC2_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 2 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 737 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA3_NTB3_NTC2_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x128_MI16xNjBWpMn3c1heP4-ZRijEunrZs2HE1y40j8U-EfP2U2k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB2_NTC2_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 2 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 738 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB2_NTC2_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x128_MI16xvejCouANmUuYraciG-6uOt0f2njtbmy4tUtKdVZGg_0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 739 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x128_MI16xk7yXgpslShWlYmKP4vu12_Y2qkIHXCYRxkeBBpLKj68= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 110592 + LdsInitCVgprs: false + LdsNumBytes: 110592 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 740 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x64x128_MI16xQWdyuM1_8TsD3HiDTMLRw0nL3MEV30at8Lf6zEwYsvw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_1_MO40_NTn1_NTA0_NTB3_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 110080 + LdsInitCVgprs: false + LdsNumBytes: 110080 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 1] + MIWaveTileA: 6 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 741 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_1_MO40_NTn1_NTA0_NTB3_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 1 + ThreadTileA: 24 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x64x128_MI16xh_pjhf94hQ3YzECoL_Y99V1Y8Jm6JTWPVHKWAO2JxRk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA1_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 110080 + LdsInitCVgprs: false + LdsNumBytes: 110080 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 1] + MIWaveTileA: 6 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 742 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA1_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 1 + ThreadTileA: 24 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x128_MI16RdakLaLP4h1D6BJCriDlyUbGc0GeGn8QmI0SPC83Xwg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 743 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x128_MI16FE9cjsjuCQjPTd2OQ1h0Y7_M7XPKH0wXDeSkx1FruHI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC7_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 7 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 744 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC7_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x64_MI16x1NtnJtSv1Fr9l6BR5Qd8-oJfWiWRZ6lDC-4KEx-A0LrI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB2_NTC6_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49920 + LdsInitCVgprs: false + LdsNumBytes: 49920 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 745 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB2_NTC6_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x80x128_MI16ZO7FwDptmg3_UgTeoEluKean-4jY-HbF_httdSGTmiY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB2_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 746 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB2_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x192x128_MI16RSvcZUuQAJ7HS9i0hyvaUWbChM_7uAM_mTBAzePd0F0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB1_NTC4_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 144384 + LdsInitCVgprs: false + LdsNumBytes: 144384 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 55296 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 72192 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 89088 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 89088 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 4 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 747 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB1_NTC4_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x80x128_MI16kwz7SePrqsr-88UBxVyY3Nncw5xR5BFYDbTaSvEJbhw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_5_MO40_NTn1_NTA0_NTB3_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 748 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_5_MO40_NTn1_NTA0_NTB3_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x128x128_MI16_b6b9vrrOsRF9FjsMoIWMKbPWc1tw4y9bNsA-zT8-xM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA2_NTB2_NTC1_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 749 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA2_NTB2_NTC1_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI16dbj71rapkPJ3wps8_sYFT11dMXVH_jpdCNybsnCvI4c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA3_NTB1_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 26112 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 750 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA3_NTB1_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x128x64_MI16muNJiITd5z-woYWAMUnor_jijVfn7HMPmEZ0oF3tqa0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA3_NTB3_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 110592 + LdsInitCVgprs: false + LdsNumBytes: 110592 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 751 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA3_NTB3_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x224x64_MI16YukZKE8Da5cmVgOebjiRra8TOGmOnkexQwMOY8G7Tw8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA1_NTB3_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63488 + LdsInitCVgprs: false + LdsNumBytes: 63488 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 63488 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 7] + MIWaveTileA: 6 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 224 + MacroTileA: 192 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 168 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 6 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 752 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA1_NTB3_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 7 + ThreadTileA: 24 + ThreadTileB: 7 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x128x64_MI16RBY6LZx8bRDK-Xx7Ag9NIH7PTJYLmTmmrxghZ8rhPlY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_2_MO40_NTn1_NTA1_NTB0_NTC5_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 44544 + LdsInitCVgprs: false + LdsNumBytes: 44544 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 2] + MIWaveTileA: 12 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 5 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 753 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_2_MO40_NTn1_NTA1_NTB0_NTC5_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x64_MI16x1zuux-pVRl-1wfzxsFqtQU4GPxQDw3gPxbhsnul0VfaE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA5_NTB0_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29440 + LdsInitCVgprs: false + LdsNumBytes: 29440 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 24832 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 24832 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 754 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA5_NTB0_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x64_MI16x1wgOh7myiPvMSmjC0cxkJ85G_3aBg8ZvwOa_r8QT5oq0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA5_NTB3_NTC0_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54656 + LdsInitCVgprs: false + LdsNumBytes: 54656 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 12672 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 755 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA5_NTB3_NTC0_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 6 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16xzgUl_qnlemPLgz6lX6E9lpTMaBR2BJtFQM7YKLibM7M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA4_NTB1_NTC0_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 62848 + LdsInitCVgprs: false + LdsNumBytes: 62848 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 12672 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 16 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 756 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA4_NTB1_NTC0_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI32aRHb0rmz88BLMt7H-FvB4D1dm7-xNP0MDgAfTulTqCw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA6_NTB1_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99072 + LdsInitCVgprs: false + LdsNumBytes: 99072 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16640 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 757 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA6_NTB1_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 1 + ThreadTileA: 64 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16_YTPowexxKe3Jsx4qj9u4vtT3_Vi3jGEXqIrZCij8wU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA7_NTB0_NTC7_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 103552 + LdsInitCVgprs: false + LdsNumBytes: 103552 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 0 + NonTemporalC: 7 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 758 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA7_NTB0_NTC7_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x224x64_MI16GsV-fziBR_T47iTI6Sz59Ao6-CUbb1-0fp9ysnl62Hg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA6_NTB3_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 112000 + LdsInitCVgprs: false + LdsNumBytes: 112000 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 224 + MacroTileA: 128 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 4 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 759 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA6_NTB3_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16xIlhxOH1Oqh9hMZmbXp3S_VoNh2dUxECCbYiRg30C--E= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA1_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65024 + LdsInitCVgprs: false + LdsNumBytes: 65024 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 13824 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 51200 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 760 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA1_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16HNTrjnzevfXqAlxoeWd0GjjxJMyU3QHa3x7ak36qRKU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA6_NTB0_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99840 + LdsInitCVgprs: false + LdsNumBytes: 99840 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 761 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA6_NTB0_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16LxPJprZPJqL2HUBlQ1rYrzOdiZyfb2hiRAfmmzhc6-I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA7_NTB0_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 103552 + LdsInitCVgprs: false + LdsNumBytes: 103552 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 762 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA7_NTB0_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT160x128x64_MI16gBsvtvKrEeTuCwRBOupt4cpfibONGYq2uPZ0SHbbfY8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_2_MO40_NTn1_NTA1_NTB1_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 105472 + LdsInitCVgprs: false + LdsNumBytes: 105472 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 88576 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 2] + MIWaveTileA: 10 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 20 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 20 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 763 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_2_MO40_NTn1_NTA1_NTB1_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 2 + ThreadTileA: 40 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x384x64_MI16nmuzXpgsMN9AfDrnVukONHqfCmykHd7p7Kgqegj8TB4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x384x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA2_NTB2_NTC7_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 50688 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 84480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 84480 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 384 + MacroTileA: 128 + MacroTileB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 7 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 764 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x384x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA2_NTB2_NTC7_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16_IqOxiGBDNP8ml49xazJLcAIIB2zcLhzL3v-NRUmSqk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB3_NTC5_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 765 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB3_NTC5_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x16x64_MI16xaHIUqumpYMOd_eSWVGafheovipUJxlMuIr2-iTHYjGI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_1_MO40_NTn1_NTA5_NTB0_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51968 + LdsInitCVgprs: false + LdsNumBytes: 51968 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [8, 1] + MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 766 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_1_MO40_NTn1_NTA5_NTB0_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x64_MI16x1cP6IgbIVYUJrrrs2fHm3OtbSZ3P2bg2FSpaoSmHcKf8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24832 + LdsInitCVgprs: false + LdsNumBytes: 24832 + LdsNumElementsAlignedA: 4224 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4224 + LdsOffsetB_Blk: 20608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4224 + LdsOffsetMetadata_Blk: 20608 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 767 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x48x64_MI16xmH7RnjKq3nvhzyoQ47EcGZgIa8QRqzxrV8MdEDpU-ZE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA7_NTB1_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58112 + LdsInitCVgprs: false + LdsNumBytes: 58112 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 6912 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 51200 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 48 + MacroTileA: 128 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 16 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 768 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA7_NTB1_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT112x64x64_MI16xxLO83H1JHUDs3HCJ0kcE-UkraK1VFdP49cA-cwHsgTM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT112x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA2_NTB0_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58112 + LdsInitCVgprs: false + LdsNumBytes: 58112 + LdsNumElementsAlignedA: 16128 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16128 + LdsOffsetB_Blk: 48896 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16128 + LdsOffsetMetadata_Blk: 48896 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [7, 1] + MIWaveTileA: 7 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 112 + MacroTile1: 64 + MacroTileA: 112 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 14 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 769 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT112x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA2_NTB0_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 1 + ThreadTileA: 28 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16x5QMdH8YrjjeCSh_avPALQzY5SB_v1EPRRtI5SfrFqqQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA0_NTB1_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 62848 + LdsInitCVgprs: false + LdsNumBytes: 62848 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 12672 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 16 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 770 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA0_NTB1_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT448x16x64_MI16xmIZJTXVn3RUnJZbpj_qqvGQHG7gFV19O_PCjRx4HjtM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x16x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT7_1_MO40_NTn1_NTA6_NTB0_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 74240 + LdsInitCVgprs: false + LdsNumBytes: 74240 + LdsNumElementsAlignedA: 71680 + LdsNumElementsAlignedB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 71680 + LdsOffsetB_Blk: 202752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 74240 + LdsOffsetMetadata_Blk: 202752 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [7, 1] + MIWaveTileA: 7 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 448 + MacroTile1: 16 + MacroTileA: 448 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 14 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 771 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x16x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT7_1_MO40_NTn1_NTA6_NTB0_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 1 + ThreadTileA: 28 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT416x32x64_MI16xb3XFV5kpSoh75iGZ49SShhlugugjIq5b23TsJ-Lk_dE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT416x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT13_1_MO40_NTn1_NTA5_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 54912 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 54912 + LdsOffsetB_Blk: 120448 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 54912 + LdsOffsetMetadata_Blk: 120448 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [13, 1] + MIWaveTileA: 13 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 416 + MacroTile1: 32 + MacroTileA: 416 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 52 + NumGlobalWriteVectorsPerThread: 52 + NumLoadsA: 13 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 13 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 772 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT416x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT13_1_MO40_NTn1_NTA5_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 52 + ThreadTile1: 1 + ThreadTileA: 52 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT224x64x64_MI16x9RLO9cqrWVBnf5WW56Yguk0AKiGz8o1xu1IbeQC3gHY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_1_MO40_NTn1_NTA5_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42496 + LdsInitCVgprs: false + LdsNumBytes: 42496 + LdsNumElementsAlignedA: 32256 + LdsNumElementsAlignedB: 10240 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32256 + LdsOffsetB_Blk: 97792 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42496 + LdsOffsetMetadata_Blk: 97792 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [14, 1] + MIWaveTileA: 14 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 64 + MacroTileA: 224 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 2 + NonTemporalC: 7 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 7 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 773 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_1_MO40_NTn1_NTA5_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 56 + ThreadTile1: 1 + ThreadTileA: 56 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT224x96x64_MI16xyPY5B5hOQbVhJsxdxS4er--OQ-G5NLIqCO2zb6wLrrM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x96x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA6_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 3] + MIWaveTileA: 7 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 96 + MacroTileA: 224 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 84 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 7 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 774 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x96x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA6_NTB1_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 3 + ThreadTileA: 28 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xGRJ948PW2GY3Oh-BgL5E3cP96ILTWrBUyogE0huOZFk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49664 + LdsInitCVgprs: false + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 775 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM16_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x48x256_MI16x32d0yrzBuc6cQz2druL4nmWJZCsoXcp2TWx6BH5c44M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB5_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 107776 + LdsInitCVgprs: false + LdsNumBytes: 107776 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 48 + MacroTileA: 32 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 5 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 6 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 776 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB5_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x80x256_MI16xFfV_Lyi8V7Vj9rgZw1A__J5um7q_2JfLpTqYZudeVf8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x80x256_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA2_NTB5_NTC2_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 60928 + LdsInitCVgprs: false + LdsNumBytes: 60928 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 43520 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 60928 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 5] + MIWaveTileA: 1 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 80 + MacroTileA: 32 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 5 + NonTemporalC: 2 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 10 + NumGlobalWriteVectorsPerThread: 10 + NumLoadsA: 4 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 777 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x80x256_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA2_NTB5_NTC2_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 5 + ThreadTileA: 4 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x160x128_MI16AbMFVKD5zbFEAS433D1hwrSvtHdmdgk3gF2Lnchkhdg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x160x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA3_NTB4_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55296 + LdsInitCVgprs: false + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 5] + MIWaveTileA: 1 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 160 + MacroTileA: 32 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 4 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 778 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x160x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA3_NTB4_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 5 + ThreadTileA: 4 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x512_MI16xGQWs24PV7Ym1YlzWNb85RVCaZ6eCHgA5WP9fLETVl0M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB6_NTC3_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 3 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 779 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB6_NTC3_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x512_MI16xS5DY8moH66uzpr6SJxsOicRUOH_OQ0U6eqycBbQQpKk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB7_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 7 + NonTemporalC: 3 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 780 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB7_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x16x256_MI16xK2rvBSaK8FYS6JPr6NUxMFfDsXUoeePQtJqs_k2sb-k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB6_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58112 + LdsInitCVgprs: false + LdsNumBytes: 58112 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 781 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB6_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x16x512_MI16xjr5IN7ix-0abROpjyU0ooEpE9LYZoM-aJ7Ri-cIzzGk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB4_NTC2_NTD7_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 115712 + LdsInitCVgprs: false + LdsNumBytes: 115712 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 4 + NonTemporalC: 2 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 782 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB4_NTC2_NTD7_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x16x512_MI16xsTI5BWTuHboBtEqod2TVB8lK-ydy8-gnlqHKaYHgf9Y= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB7_NTC3_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 7 + NonTemporalC: 3 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 783 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB7_NTC3_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x256_MI16x2GFdk17QQ3uA1d6t9wU-zMO5ILQ12zaI9jC-3SbeW3w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB6_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 784 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB6_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x160x64_MI16xIEJtM0Qs-Qt5ANRrJRe1y6NvM51H-1dK68AJYkLeue8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA2_NTB7_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63104 + LdsInitCVgprs: false + LdsNumBytes: 63104 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 160 + MacroTileA: 64 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 7 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 785 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA2_NTB7_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x256_MI16xeEawEtm9tN_EZpDn4_hzdfTUqHTyl86g1Ec6oFWG5Ko= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA3_NTB6_NTC0_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 6 + NonTemporalC: 0 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 786 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA3_NTB6_NTC0_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x96x128_MI16x3s3lr2-RoTRlIwDQjjn2ZSVERgN7-naW9cscRwbJKW4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA1_NTB2_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55296 + LdsInitCVgprs: false + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 787 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA1_NTB2_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x32_MI16xrGuCXEyJQELfLKWrgj38v3ZHz02h1cWtKueGMrvcM6g= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB1_NTC6_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49664 + LdsInitCVgprs: false + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 7680 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 788 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB1_NTC6_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x96x64_MI16x7PXNBPx-awrBePFiQtA7_cPxzuUzObGAITEOG6nBd_I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA3_NTB7_NTC7_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 112000 + LdsInitCVgprs: false + LdsNumBytes: 112000 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 12672 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 7 + NonTemporalC: 7 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 789 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA3_NTB7_NTC7_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16ZqWfdAnZESKd0mXkLxfL_F0MIMePupboVSfGTQic0Es= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA2_NTB7_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 107776 + LdsInitCVgprs: false + LdsNumBytes: 107776 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 7 + NonTemporalC: 1 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 790 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA2_NTB7_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16UZBV0Y--yvpSgMgqfad2-IgosEc_9T8mQsqktcRmzOE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB4_NTC7_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 4 + NonTemporalC: 7 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 791 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB4_NTC7_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x128_MI16xDKx3iHWUtLm2OzQG30OfE_ZHq_7AQXRRMtC4or3g8qc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24832 + LdsInitCVgprs: false + LdsNumBytes: 24832 + LdsNumElementsAlignedA: 4224 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4224 + LdsOffsetB_Blk: 20608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4224 + LdsOffsetMetadata_Blk: 20608 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 792 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xVS6-8aSM6wy2k6RWi_rYvzRomSA3jlwT8gPLCp-eAwk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49664 + LdsInitCVgprs: false + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 793 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x16x64_MI16x1tn-CMhx1xYKpMgOqCcpBalQhLAky9RcYoTGCvj9LiAs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC5_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 27136 + LdsInitCVgprs: false + LdsNumBytes: 27136 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 24832 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 24832 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 5 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 794 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC5_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x256_MI16xpaqLB2iptpeDBUIZq9RHY34zWEG6sfG_iD3vwCS6rvc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 3 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 795 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x64_MI16x1ST08a-lQgNJzT-bJf5BENGSqbDLJjbqK9ZONXtkOIO0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB1_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54656 + LdsInitCVgprs: false + LdsNumBytes: 54656 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 12672 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 796 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB1_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT80x64x256_MI16xIqItKtjOeUz3Ra2yL7eq9_6MF0QGtPXxrqjl9UDVFBI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 43520 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 43520 + LdsOffsetB_Blk: 121856 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 121856 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 1] + MIWaveTileA: 5 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 80 + MacroTile1: 64 + MacroTileA: 80 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 10 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 797 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 1 + ThreadTileA: 20 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT80x128x128_MI16a1GgaJ1XhpWBaiZAklfYPebdZd3fO_FZZ5JDkBjcRSc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 88576 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 80 + MacroTile1: 128 + MacroTileA: 80 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 798 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 2 + ThreadTileA: 20 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x160x64_MI16Tj46MrSmcjsXsLpb8F4XMOc9k2TfvJyIN75AYpc9rNk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x160x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA2_NTB1_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53248 + LdsInitCVgprs: false + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 5] + MIWaveTileA: 6 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 160 + MacroTileA: 192 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 120 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 6 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 799 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x160x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA2_NTB1_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 5 + ThreadTileA: 24 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT288x96x64_MI16xcW6aDiVO9kNXRsYsIYuAvUGZD4Wy-UMkRlFyQ9a6KBc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT288x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT9_3_MO40_NTn1_NTA3_NTB4_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 111616 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 46080 + LdsOffsetMetadata_Blk: 111616 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [9, 3] + MIWaveTileA: 9 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 288 + MacroTile1: 96 + MacroTileA: 288 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 4 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 108 + NumGlobalWriteVectorsPerThread: 108 + NumLoadsA: 9 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 9 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 800 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT288x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT9_3_MO40_NTn1_NTA3_NTB4_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 36 + ThreadTile1: 3 + ThreadTileA: 36 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xR-0zy6tezCC3idb-nW6ozYgH4I_DR2dqYJ-S39GhMmA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB2_NTC2_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49664 + LdsInitCVgprs: false + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 2 + NonTemporalC: 2 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 801 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB2_NTC2_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xLQozifCqpnSazKcFwV_qVVGa5O46gmyk__cb9RHQQ8g= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB0_NTC3_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49664 + LdsInitCVgprs: false + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 0 + NonTemporalC: 3 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 802 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB0_NTC3_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16xKO0dNQa5bcn8HBJvyQ27-6Spm9wJOYkxisdXIhwkEfI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB1_NTC5_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 62336 + LdsInitCVgprs: false + LdsNumBytes: 62336 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 12672 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 803 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB1_NTC5_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x96x128_MI16xcQJaylFGXq_0TX9Hl4UpLe98bgmSD52B0suD_JFJXdc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA2_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120832 + LdsInitCVgprs: false + LdsNumBytes: 120832 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 804 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA2_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x64_MI16x1XSaY9E04TUGhE7jaKwxVi1iMJUyzWT0p2SC6I5Xq3dI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29440 + LdsInitCVgprs: false + LdsNumBytes: 29440 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 24832 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 24832 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 805 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x64_MI16xlo5dPaFD1Z2wNLfWpqUVXrwi4qp1VyXkrK1mHg9vrBk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB2_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59904 + LdsInitCVgprs: false + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 806 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB2_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16not8US0zBp6KQnvU6T7iGKsGOpHJyy6OXpjVcrh8di0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA3_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 125440 + LdsInitCVgprs: false + LdsNumBytes: 125440 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 4] + MIWaveTileA: 12 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 24 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 24 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 807 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA3_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x128_MI1h7E_d5sJJteGIIjL-Lj0Y2uruXc0_DPC4873LrLNlQc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA3_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 159744 + LdsInitCVgprs: false + LdsNumBytes: 159744 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 79872 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 113664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 113664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 808 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA3_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x192x128_MI166Yhc8rndgFfNj72uWs6aJ8gw1o3E1jyb1Xl1FO_HXJc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x192x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA3_NTB1_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 162816 + LdsInitCVgprs: false + LdsNumBytes: 162816 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 55296 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 81408 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 107520 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 107520 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 192 + MacroTileA: 96 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 809 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x192x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA3_NTB1_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 3 + ThreadTileA: 24 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x64_MI1653RT6YX0qL65pbCWKVRSCXEybm7S--Kgpdzo9IAq34I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA3_NTB0_NTC7_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120832 + LdsInitCVgprs: false + LdsNumBytes: 120832 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 7 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 810 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA3_NTB0_NTC7_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16lyiZ_OWA7CBIxG5UQu-LoZ4PeF9fqQR2IL76Gm18yq4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA0_NTB0_NTC6_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 3] + MIWaveTileA: 16 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 811 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA0_NTB0_NTC6_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16FpHBpJztLFdPOpQm5LgXjXS6thXX1EofipNUGlAccrM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA2_NTB1_NTC7_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 3] + MIWaveTileA: 16 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 7 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 812 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA2_NTB1_NTC7_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x64_MI16x1peVBAmQGI6Gk8S474Te1ZTRXqknwudSRUC27VYvODes= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 12480 + LdsInitCVgprs: false + LdsNumBytes: 12480 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2176 + LdsOffsetMetadata_Blk: 10368 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 813 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x64_MI32x3or2ZfDxoEVPyIE9ENx34n2Ta6maweLlfi_muaxl1jt4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC3_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49792 + LdsInitCVgprs: false + LdsNumBytes: 49792 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 8320 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 3 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 814 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC3_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16F92daEiSH1niks6NSxdTY8_WZekz2j79_GMV5tkmidw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB3_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 103552 + LdsInitCVgprs: false + LdsNumBytes: 103552 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 815 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB3_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16O-lmJvNZ1yZAl5hwmLIls_3x7xZrDhodvBwMedgcR8s= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB3_NTC7_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 816 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB3_NTC7_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x128_MI16f28emv7KjCxL7hEsAfYMGEjDqXKAlsfhf-zSWgM_SE4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA4_NTB2_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 2 + NonTemporalC: 3 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 817 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA4_NTB2_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI3WK9qxSffVXxVRK0eEFODtLFet-BzA4MlNP4jvdkjtYU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA5_NTB0_NTC6_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 136192 + LdsInitCVgprs: false + LdsNumBytes: 136192 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 68096 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 818 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA5_NTB0_NTC6_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 1 + ThreadTileA: 64 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x128x128_MI16mttDLu8BtirRNUBQVYgcmsN4b0XMQewoI9Uyje1UTQM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA4_NTB2_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 819 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA4_NTB2_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI3X3GmYl-7ra3jIh9fVES1YTKpNJhlxYauDVyCOafbM_k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 820 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x128_MI1MSg7HvaEv6UulCPk3flAi9a5H9WOAXEFX3ej6sG_23Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_10_MO40_NTn1_NTA1_NTB2_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 43520 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 113152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 113152 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 10] + MIWaveTileA: 2 + MIWaveTileB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 821 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_10_MO40_NTn1_NTA1_NTB2_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 10 + ThreadTileA: 8 + ThreadTileB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16G9rWlHGj8AkE0shTbmuyb8kBmQeD0vtA-3rtxqskVSo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA5_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 110592 + LdsInitCVgprs: false + LdsNumBytes: 110592 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 822 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA5_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x224x64_MI165-7blOixwN1FuEz7zrKBwR46Fnw4Xw_XU9I4GbQUS_M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA7_NTB2_NTC6_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 118784 + LdsInitCVgprs: false + LdsNumBytes: 118784 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 224 + MacroTileA: 128 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 4 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 823 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA7_NTB2_NTC6_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16c39Ae3LGGzg5uhOwIVro1DsGeSOQFAOSrUr5RBywozg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB3_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 4 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 824 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB3_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16wj34IQ8H1F7QK2OFVXhCekHWMDqTfcCtKjlW7CWIPNo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB0_NTC2_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 2 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 825 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB0_NTC2_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI32qUwpgF5DFZUbxYcCe9WC2bRLLCw40-B2Qejmz10YiQo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB1_NTC5_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116736 + LdsInitCVgprs: false + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 826 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB1_NTC5_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16JqQbv0miPuBjuGTO1QiYzu8EIEL_UJAOO0xeIxhcEbs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA3_NTB3_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 827 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA3_NTB3_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16nI3ni6kxE02eaceTyJPHQfs7yo8MOR6rn7fXPUu2ngU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA3_NTB1_NTC3_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 4] + MIWaveTileA: 12 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 3 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 828 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA3_NTB1_NTC3_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16831DzVdHpcAfN06nMnpuZxaklAEKQXJeyS_IeVQlloc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA1_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 24 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 24 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 829 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA1_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI1PcicnN4rhPttnOPnqCBtYuhygrtG2QIv9rps9xbG1-w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA7_NTB2_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 2 + NonTemporalC: 2 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 830 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA7_NTB2_NTC2_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16zFtrwhsejERtXD8XH0g2fVim3OqNCT3BFOFeYtWolT4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA5_NTB3_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 110592 + LdsInitCVgprs: false + LdsNumBytes: 110592 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 3 + NonTemporalC: 4 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 831 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA5_NTB3_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 6 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI163EUBhVCu7YU8YiB7B3wxOFkCvzvOOK-aaTXEsimCDG0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB3_NTC7_NTD6_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 47616 + LdsInitCVgprs: false + LdsNumBytes: 47616 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 30720 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 47616 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 832 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB3_NTC7_NTD6_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI16a-E-7tb-ZO9KJftGvW0wAT7_6vQqw3M4FpLKjRsuatA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB3_NTC5_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59392 + LdsInitCVgprs: false + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 833 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB3_NTC5_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT160x128x64_MI16IFqTcGgWxKaegNxP_uRujs8TxMJusMTslwWMuxlGTNk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT10_2_MO40_NTn1_NTA0_NTB0_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41472 + LdsInitCVgprs: false + LdsNumBytes: 41472 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41472 + LdsOffsetMetadata_Blk: 88576 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 2] + MIWaveTileA: 10 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 5 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 834 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT10_2_MO40_NTn1_NTA0_NTB0_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 2 + ThreadTileA: 40 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16Lf7yMV7SfSJQN_Rq9dH2jSQ03y3QAAOcvNPbPiBs8gU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB1_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 7 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 835 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB1_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI164h-9S6LBBQbawdsG_igSxm0hlDyJtLl8drjlKCn9Sbs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 836 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16rRMDIrqhG8IEfbyOeh_QKMWXhQ7OssLMSua0QphqBuA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA2_NTB3_NTC7_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 107776 + LdsInitCVgprs: false + LdsNumBytes: 107776 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 837 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA2_NTB3_NTC7_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT8x512x32_MI4x4xwGexg6NUAYSiNzq6JH2hfgr3rNJK4soBJJB-MXGLwqI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT8x512x32_MI4x4x16_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA3_NTB1_NTC2_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG4_64_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37504 + LdsInitCVgprs: false + LdsNumBytes: 37504 + LdsNumElementsAlignedA: 640 + LdsNumElementsAlignedB: 36864 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 640 + LdsOffsetB_Blk: 66176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37504 + LdsOffsetMetadata_Blk: 66176 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 32 + MFMA_BF16_1K: true + MIArchVgpr: 1 + MIBlock: [4, 4, 4, 16, 1, 16] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 8 + MacroTile1: 512 + MacroTileA: 8 + MacroTileB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 16 + MatrixInstBM: 1 + MatrixInstBN: 16 + MatrixInstK: 4 + MatrixInstM: 4 + MatrixInstN: 4 + MatrixInstruction: [4, 4, 4, 16] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 2 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 838 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT8x512x32_MI4x4x16_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA3_NTB1_NTC2_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG4_64_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 1 + SubGroup1: 256 + SubGroupA: 1 + SubGroupB: 256 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [4, 64, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x128_MI16x0shvE3W20sfkGsqjkGAyg6djwqaSgPHwUixXD3lRH6I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA3_NTB1_NTC6_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 839 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA3_NTB1_NTC6_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x144x64_MI16KX7iYmMaGYl8nu7g4CVDr0rGfHiYuGHGAZaIlxOe0vg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x144x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA2_NTB5_NTC3_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120064 + LdsInitCVgprs: false + LdsNumBytes: 120064 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 20736 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 9] + MIWaveTileA: 4 + MIWaveTileB: 9 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 144 + MacroTileA: 256 + MacroTileB: 144 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 5 + NonTemporalC: 3 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 8 + NumLoadsB: 18 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 18 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 840 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x144x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA2_NTB5_NTC3_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 6 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 9 + ThreadTileA: 16 + ThreadTileB: 9 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16TyXNsJWi7nS-fhS0PXr9_wo25310Nj8ClA6t3ipliVQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB5_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 5 + NonTemporalC: 4 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 841 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB5_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16Jygg6B3xFpjouJs-J7lNnFqbr_fjHkpVkG7TeVAeGxw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB7_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 3] + MIWaveTileA: 16 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 7 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 842 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB7_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI1669pYsjaG-bhznKUtGHu2lf-bBXpxPXs6Ova8-wIDIt0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB6_NTC6_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 6 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 843 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB6_NTC6_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 7 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x128_MI16xs7-6yhjj1HNkkgTrM--grrsZfgn_E6vCNGAM1q6_bH0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB1_NTC3_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 3 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 844 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB1_NTC3_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x64x32_MI16x1il9kR7gtAdgfECIrzvR6BT10ESmOvAbBH1EZq9McV9o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC1_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 13568 + LdsInitCVgprs: false + LdsNumBytes: 13568 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 9344 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1152 + LdsOffsetMetadata_Blk: 9344 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 845 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC1_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT384x160x64_MI16R6ufUzTOAsbT90lr9NkBtbosxb-cN2_TH1hLdM-Dexk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA0_NTB1_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 146688 + LdsInitCVgprs: false + LdsNumBytes: 146688 + LdsNumElementsAlignedA: 52224 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73344 + LdsOffsetB: 52224 + LdsOffsetB_Blk: 125568 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 52224 + LdsOffsetMetadata_Blk: 125568 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 5] + MIWaveTileA: 12 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 160 + MacroTileA: 384 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 4 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 48 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 48 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 846 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA0_NTB1_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 5 + ThreadTileA: 48 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16wwr4HSZThMg9UFZzWEWSfwJ94L4DnKO_w2bbGpU0T2I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA4_NTB2_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 847 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA4_NTB2_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x32_MI16x1CkA53Ue8YbA1i51n1-TuEmeUsso4seYgPmxkN3dKByo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB0_NTC5_NTD7_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 15360 + LdsInitCVgprs: false + LdsNumBytes: 15360 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 10752 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 5 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 848 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB0_NTC5_NTD7_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x256_MI16xFPsdLYz1dTSwqg2G0kCB_cwzxZZmZnKIf6S528fVuxQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB7_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 7 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 849 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB7_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x32_MI16x1pxdxB7UzAAXNDQuUvHLJ9h-cxAxC2V8kba8-ffvW4ms= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC5_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 10240 + LdsInitCVgprs: false + LdsNumBytes: 10240 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 10240 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 5 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 850 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC5_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x128_MI16x7GJuR3bjr01Bn5Hf-sWoQ6LJVu-AfpXaH9dH5ASE18s= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB4_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59392 + LdsInitCVgprs: false + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 4 + NonTemporalC: 1 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 851 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB4_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x32_MI16x1DjaUH_vsJx9v9dbE0O3rvNbppHjy8gK0goRMSQdisjE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA2_NTB0_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 5 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 852 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA2_NTB0_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x32_MI16xrNceqmal1vdzvtYc91d52ZnBJgSpU1Rdp4kEJiBIQM8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA1_NTB2_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29056 + LdsInitCVgprs: false + LdsNumBytes: 29056 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 24832 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 24832 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 1] + MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 853 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA1_NTB2_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16xk52S4ad-TevDChGWNhPb1yoNNAIq5pQ-fdl5f8rEbQM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB1_NTC6_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 854 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB1_NTC6_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x32_MI16x_hS5WClhqnmWkhPlPPC32KhHn-1rAZBvooAf1TMGx8w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB0_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 14336 + LdsInitCVgprs: false + LdsNumBytes: 14336 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 14336 + LdsOffsetMetadata_Blk: 25600 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 855 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB0_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x128_MI16iVIYZb1BYuBA6lEYwxwvZ7109NzN99TD6TIDLsTQRVQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB1_NTC6_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 856 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB1_NTC6_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16x71aO-s1g_2o51T8HyLV9LYsofU877J6GJdCwBJQpGkQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_1_MO40_NTn1_NTA1_NTB2_NTC6_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58112 + LdsInitCVgprs: false + LdsNumBytes: 58112 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 1] + MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 857 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_1_MO40_NTn1_NTA1_NTB2_NTC6_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x128_MI16xS2jMbDtQ5_F3P86CkrJHRZTDOo-7OylhPwPhVJb2OL8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA1_NTB1_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 110592 + LdsInitCVgprs: false + LdsNumBytes: 110592 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 858 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA1_NTB1_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16BFaR0ZYbZkhwWjT_Ba8i0NCAy6KAmcxHOp6SgAHg2_o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA2_NTB1_NTC5_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 107776 + LdsInitCVgprs: false + LdsNumBytes: 107776 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 16 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 859 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA2_NTB1_NTC5_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16KPP61EtqBNTlRJ-I_aAukkPlJtkB7oO8oAgiQ2R0llw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB6_NTC7_NTD6_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 6 + NonTemporalC: 7 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 860 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB6_NTC7_NTD6_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x224x64_MI16Fw_dvYNukHFKOVviLuxEVKNl1KXvcgtWMXYpu_RV9bg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB3_NTC7_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53248 + LdsInitCVgprs: false + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 224 + MacroTileA: 128 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 4 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 861 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB3_NTC7_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x64_MI16x1fVSXE7Hp2-FCWO4DdLL1bSkdGk5CHnY7g80tz5B_EqA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB2_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 12480 + LdsInitCVgprs: false + LdsNumBytes: 12480 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2176 + LdsOffsetMetadata_Blk: 10368 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 862 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB2_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x128_MI16xBe8y_mZMAbiXjK4oCAn0nsuNlixq212gvIUfboQHg4k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC2_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24832 + LdsInitCVgprs: false + LdsNumBytes: 24832 + LdsNumElementsAlignedA: 4224 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4224 + LdsOffsetB_Blk: 20608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4224 + LdsOffsetMetadata_Blk: 20608 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 2 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 863 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC2_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x64_MI16x1pQSIDqCoQx3_Ri79K5ad2lS5jlijd7Sp5JLVMDsb73Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24832 + LdsInitCVgprs: false + LdsNumBytes: 24832 + LdsNumElementsAlignedA: 4224 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4224 + LdsOffsetB_Blk: 20608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4224 + LdsOffsetMetadata_Blk: 20608 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 864 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x128_MI16xQUi52S7VSdGJDn5Bpm7RKSM5k6RLZWm__jwH2YQ0HgQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24832 + LdsInitCVgprs: false + LdsNumBytes: 24832 + LdsNumElementsAlignedA: 4224 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4224 + LdsOffsetB_Blk: 20608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4224 + LdsOffsetMetadata_Blk: 20608 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 865 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x320x32_MI16PMVTS3yNba5IV3cNNVe8WrfKpppfUC2VxgXy7Itzw2Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x320x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA2_NTB3_NTC5_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63104 + LdsInitCVgprs: false + LdsNumBytes: 63104 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 320 + MacroTileA: 128 + MacroTileB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 866 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x320x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA2_NTB3_NTC5_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS64_SPO0_SRVW0_SSO1_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 64 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x64_MI16x1z-qFNfK0ass6s-mBeCbn2T-UhuXzV92gtCkJ2GTXgUE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 12480 + LdsInitCVgprs: false + LdsNumBytes: 12480 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2176 + LdsOffsetMetadata_Blk: 10368 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 867 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x256_MI16xYSECTInVbJDnbiylgv7Bz8iWJfwZ9tNFBp-qDs3Q8Zg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB6_NTC3_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51712 + LdsInitCVgprs: false + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 3 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 868 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB6_NTC3_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x224x64_MI16mkECGi4PfVq_DSg4bBQNpVTBWlPse1LKEVgkFINTS_M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB6_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63488 + LdsInitCVgprs: false + LdsNumBytes: 63488 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 63488 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 7] + MIWaveTileA: 6 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 224 + MacroTileA: 192 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 1 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 168 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 6 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 869 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB6_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 7 + ThreadTileA: 24 + ThreadTileB: 7 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x288x64_MI16sEKHjZV32z0wCgP-Ck4vxRRsykPsLTvAc2SuYHI8fYk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x288x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA3_NTB0_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 73728 + LdsInitCVgprs: false + LdsNumBytes: 73728 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 158720 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 73728 + LdsOffsetMetadata_Blk: 158720 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 9] + MIWaveTileA: 6 + MIWaveTileB: 9 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 288 + MacroTileA: 192 + MacroTileB: 288 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 216 + NumGlobalWriteVectorsPerThread: 108 + NumLoadsA: 6 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 9 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 870 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x288x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA3_NTB0_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 9 + ThreadTileA: 24 + ThreadTileB: 9 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x320x64_MI16YJhfH7jHUovMMi6uEf7-a6s9RhmceVmNYR2wLw7yChQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA1_NTB3_NTC7_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 77312 + LdsInitCVgprs: false + LdsNumBytes: 77312 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 51200 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 157184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 77312 + LdsOffsetMetadata_Blk: 157184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 5] + MIWaveTileA: 12 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 320 + MacroTileA: 192 + MacroTileB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 6 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 871 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA1_NTB3_NTC7_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 5 + ThreadTileA: 48 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI161NTL0auO2BZyLodIslMIR1DwO0ELH4sdtwc7edrEZa4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB7_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 60928 + LdsInitCVgprs: false + LdsNumBytes: 60928 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 60928 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 4] + MIWaveTileA: 12 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 7 + NonTemporalC: 1 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 872 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB7_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x320x64_MI16RnRjOTmkiumEFZRZUcGRxBp2kZmZrxMRoo6KyHD2Tbw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA2_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA2_NTB5_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 77312 + LdsInitCVgprs: false + LdsNumBytes: 77312 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 51200 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 157184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 77312 + LdsOffsetMetadata_Blk: 157184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 5] + MIWaveTileA: 12 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 320 + MacroTileA: 192 + MacroTileB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 5 + NonTemporalC: 4 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 24 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 24 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 873 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA2_NTB5_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 5 + ThreadTileA: 48 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x320x64_MI16nj2MHcRCzzxakd1nPd929ZQwVRlEwGsPVIXZzv0nzwo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA2_NTB3_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 25344 + LdsNumElementsAlignedB: 42240 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 25344 + LdsOffsetB_Blk: 92928 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25344 + LdsOffsetMetadata_Blk: 92928 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 5] + MIWaveTileA: 12 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 320 + MacroTileA: 192 + MacroTileB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 4 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 6 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 874 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA2_NTB3_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 5 + ThreadTileA: 48 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16x0I8NwzTiKxiFywgzFRbmksUFYxRYGXCVu8nZ_ZarTes= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC1_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49664 + LdsInitCVgprs: false + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 875 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC1_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16LdT3Ok_31-UGAeeLmLhm7J_PfRw3U1qnSbzalcKhsDE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB6_NTC5_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 6 + NonTemporalC: 5 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 876 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB6_NTC5_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16WKRGsmaOqbeAJEIx4DocFL4x5TrEqHvywiHDMcQwGfQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB5_NTC3_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 5 + NonTemporalC: 3 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 877 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB5_NTC3_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16XDkV1WWdgWCp28uTkfX-ZMj-6qQkpDx3PqWeyjXkBTU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA2_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB6_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 6 + NonTemporalC: 6 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 32 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 878 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB6_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16LONw0lPkipYqJjxTghhorV3ImHRYpxKO0mmzF67V2QE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB6_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 6 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 879 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB6_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI161P-AfpKbDA0dd-RJGocfqkcL14EjTPhFXXFVBZp9C2w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 7 + NonTemporalC: 7 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 880 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16UL-0jXrB-iJUBJ6z2nuNkq7qNwG4Umv45OTrMUHD_Sg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA3_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 16] + MIWaveTileA: 4 + MIWaveTileB: 16 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 881 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA3_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI32mMYwjHl1gVR-fkug0zGR3LxCuZZ-9gAO-iqfZWSDU6A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 882 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x128_MI16xZxuiwjUCkloWJns9y2HurS-nLYzTfpPUANwjQ59sWfs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 883 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x256_MI16xBXEA7QLtW3O9ZsvGfaJqMm5Y9FOpoF3EFoggHLq7JtM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 884 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x32_MI16YyEvEgPf2u14k5vjILjzibCKSBhwfrPuXLaNFW_Wrns= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB3_NTC4_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100864 + LdsInitCVgprs: false + LdsNumBytes: 100864 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17920 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 4 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 885 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB3_NTC4_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI163pH1vLow3EsN2uQ-opRuYXi1hCiREgsNFKkSuy5vILk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 886 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x128_MI16mPtFI8cyGaBSop0zx9aHg6nxxsRXMm0hN1_8JohpoXk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB1_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117248 + LdsInitCVgprs: false + LdsNumBytes: 117248 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 887 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB1_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16H5rgVvmhoTBq8gkcv1oEUSoSiFuvEx01LilKaZ8NaB4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 2 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 888 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16DiVvEf0br81UZSidSMu5XE0YOZ0sVEbKyte4Rsents8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB3_NTC5_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 889 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB3_NTC5_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI328z7zlrPZoNFWjRgoyWQ6B8tO670SBJPW6w9Uj7PbVAE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA1_NTB3_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 136192 + LdsInitCVgprs: false + LdsNumBytes: 136192 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 68096 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 890 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA1_NTB3_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 128 + ThreadTile1: 2 + ThreadTileA: 128 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI167_rpY4LIMTOrIm1LuMEtZDMEAg2DwRP8EJVrJOoqzGE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC7_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 7 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 891 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC7_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16y2-BM5sQUywvo0UBxMHcKo0NNjayeizkxV9zqVnpXFc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 4 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 892 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x128_MI16xms5UVBAC-5pKvf1spjSRSVxkYaWKIBiWqZuQ2LImnzc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB6_NTC1_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 9216 + LdsInitCVgprs: false + LdsNumBytes: 9216 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 6 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 893 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB6_NTC1_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x128_MI16xiQoTBpOt_KCP21uwLsp7NPSJR81sV7RqxGZk5C2qlX4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB7_NTC2_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 9216 + LdsInitCVgprs: false + LdsNumBytes: 9216 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 7 + NonTemporalC: 2 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 894 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB7_NTC2_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16ppuM5ImiGAl39QcT8TshVstt5olMsWZONgDjC47gkgQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 895 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16TRkaxmxgVEaCXigbsd4lSs-2bWmBfCiCaTIkhvyXtjQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA6_NTB3_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 896 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA6_NTB3_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16DLK_b4BqmIOviXWtqdwvDutysymqgIRyfJqzydL2m1M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA5_NTB0_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 897 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA5_NTB0_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16wNIN879MSkh2qRAl8HYpv_Rpz5N83A-i1GYbj0niHMM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA5_NTB3_NTC3_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 3 + NonTemporalC: 3 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 24 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 24 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 898 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA5_NTB3_NTC3_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x32_MI321okaBOV1fu9vWQNAk5Oq47xiUzXFyw7FwdjXTK43yS8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32256 + LdsInitCVgprs: false + LdsNumBytes: 32256 + LdsNumElementsAlignedA: 13824 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 13824 + LdsOffsetB_Blk: 46592 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 46592 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 899 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 96 + ThreadTile1: 2 + ThreadTileA: 96 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 3 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI168VDRIrx23PTuC6lhze-A-0APG55SFM3_sqQjioo3is8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB2_NTC4_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 4 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 900 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB2_NTC4_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16x_6WVJaqJvDAy3oUIuceulyLwFp9Nt9m6fESDfdDIEk0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA2_NTB3_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58112 + LdsInitCVgprs: false + LdsNumBytes: 58112 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 1] + MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 901 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA2_NTB3_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16xiserYeQLI7kbxfk2Liw0IzS1NzNpjEwTQqu2GknzFiU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 27136 + LdsInitCVgprs: false + LdsNumBytes: 27136 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 10240 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27136 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 1] + MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 902 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16f5aXipoZHEFnpnwprApAAp-vVGXpbr5VjryeZd_ZZYw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 903 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT512x128x32_MI32sDHpLp_qRcXmLqvyJAA4COKfyZCAdYezAEba9LM73SU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT512x128x32_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA2_NTB0_NTC5_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 128 + MacroTileA: 512 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 5 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 904 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT512x128x32_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA2_NTB0_NTC5_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 128 + ThreadTile1: 2 + ThreadTileA: 128 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x16x512_MI16xHav13Ay7faFoeK4BhJe8UYA01ZshuvwPXlxS_wY_B-Y= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x512_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB6_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 84480 + LdsInitCVgprs: false + LdsNumBytes: 84480 + LdsNumElementsAlignedA: 67584 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 67584 + LdsOffsetB_Blk: 198656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 84480 + LdsOffsetMetadata_Blk: 198656 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 16 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 6 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 905 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x512_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB6_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16q8NEDpD13SSNUu5bllAIXsJO-kShQHfpJNFTTz1XaoQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC6_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 906 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC6_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 6 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xWRRxsqJ7jMcN3RhwdtlcrN_appCXI3uzm_0C6uSkDMk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB2_NTC3_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49664 + LdsInitCVgprs: false + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 2 + NonTemporalC: 3 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 907 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB2_NTC3_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x512_MI16xUxsmMWfC7lMbayJUFJtt3uhGmPoX5bqgzvqCGGL3ytI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC2_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 2 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 908 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC2_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x32x512_MI16xRRDqwALEE4LHHM1M4ceEUH0APTRcUTuF9PVaZJmnhT0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 909 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x32x512_MI16xW4SabINezihuadnkWBODMFyHK6ApsCWB1xnbnvs0qI4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 115712 + LdsInitCVgprs: false + LdsNumBytes: 115712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 0 + NonTemporalC: 3 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 910 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x32x128_MI16xizr7Q8_nCrHLOtGHAAVLDiIPvJtsXUwahODl32ZXFqU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x32x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_1_MO40_NTn1_NTA4_NTB2_NTC5_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 25344 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25344 + LdsOffsetB_Blk: 90880 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25344 + LdsOffsetMetadata_Blk: 90880 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [6, 1] + MIWaveTileA: 6 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 32 + MacroTileA: 96 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 12 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 911 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x32x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_1_MO40_NTn1_NTA4_NTB2_NTC5_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 1 + ThreadTileA: 24 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x16x256_MI16x6CocHSVfjUtMgJadpMnsKR32nEDOIAmuC97s9-SGqFk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB2_NTC3_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 2 + NonTemporalC: 3 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 912 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB2_NTC3_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 6 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x32x64_MI16xAL4LlplqX-lASFEbhgTbShC-GDPkvccQf2jTTz-nVfg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x32x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA3_NTB3_NTC5_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 38912 + LdsInitCVgprs: false + LdsNumBytes: 38912 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 38912 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 1] + MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 913 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x32x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA3_NTB3_NTC5_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x128x64_MI16yj-r8sQkpPX2ARVU-HAhTbcq98ZTuNBX4v6RVEpYxmA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC1_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 110080 + LdsInitCVgprs: false + LdsNumBytes: 110080 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 24 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 24 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 914 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC1_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16UY7JybfNYsXbB6ePwJELigTUPhCwSjLgdXAzMyu2h2o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA3_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 915 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA3_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT160x192x64_MI16YPgxiAM3E1SkUV79IztVOV85qe_ULaWDpoyyCH7fevw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_3_MO40_NTn1_NTA0_NTB1_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113920 + LdsInitCVgprs: false + LdsNumBytes: 113920 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 88576 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 3] + MIWaveTileA: 10 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 192 + MacroTileA: 160 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 120 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 20 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 20 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 916 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_3_MO40_NTn1_NTA0_NTB1_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 3 + ThreadTileA: 40 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16Yb6vs2wMAek_0gQXVuxZgWD4oIrhpoUVVgQWjM-7NSU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA2_NTB1_NTC0_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 134144 + LdsInitCVgprs: false + LdsNumBytes: 134144 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 32256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67072 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 101888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 101888 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 917 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA2_NTB1_NTC0_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x128_MI16NvJYM5TYlxX33xhRC3rYojBFPcYDUmQTKy-xgGL2zfc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA5_NTB1_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 918 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA5_NTB1_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x256x64_MI32BUhgY5-OmEvd0k2y390LwC1WKTzoJmnEmMybvkNcvGg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA4_NTB3_NTC6_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116736 + LdsInitCVgprs: false + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 919 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA4_NTB3_NTC6_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT80x128x128_MI16CH6-2Ow9x-ZJRcCwf3Spct6EpgpY9UjXCKmQfd8TKYI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 88576 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 80 + MacroTile1: 128 + MacroTileA: 80 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 920 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 2 + ThreadTileA: 20 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT48x32x256_MI16xNp06_mGTrXDLa_B2FY9_xiHpNe0EWyYQPcSwC-WZ3Qo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA6_NTB3_NTC1_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 107776 + LdsInitCVgprs: false + LdsNumBytes: 107776 + LdsNumElementsAlignedA: 25344 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25344 + LdsOffsetB_Blk: 90880 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25344 + LdsOffsetMetadata_Blk: 90880 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 32 + MacroTileA: 48 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 12 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 921 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA6_NTB3_NTC1_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16lvReLjX23li0zKLF0BjlXHUVB4dqsAX9tBGKFbFUMFU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA3_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 125440 + LdsInitCVgprs: false + LdsNumBytes: 125440 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 4] + MIWaveTileA: 12 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 24 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 24 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 922 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA3_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16Q0iQTv2siYkhhsH27R4bwiDBwuYbcFbclooq76DRSmQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB1_NTC1_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 923 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB1_NTC1_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xVOcz9sQMnRhNkFWae_nQBMHW5856NLlj0IGASkeA7mU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB1_NTC3_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49664 + LdsInitCVgprs: false + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 1 + NonTemporalC: 3 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 924 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB1_NTC3_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xg9ZeL_CtEIKuLm_H3DZ3F4HiMb4AoPozk4YDq2Piw84= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB0_NTC2_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49664 + LdsInitCVgprs: false + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 0 + NonTemporalC: 2 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 925 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB0_NTC2_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x512_MI16xf8Ie6tqcBuD2Jujp65Vo-v0ikwjRIIuuqJfqvYMHnLI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC3_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 3 + NonTemporalC: 3 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 926 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC3_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x16x512_MI16xJmNUwCek8yAJmaYGDUGFMiPxUpHP50QvJWrzpjcN2QM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC1_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 927 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC1_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xumwbQLBsbTuiz-xmysx9cyRtIiz8u-ZdtZLjBu_bjSk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17408 + LdsInitCVgprs: false + LdsNumBytes: 17408 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 0 + NonTemporalC: 3 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 928 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x16x256_MI16xctuAv9IQDoLZTgUSJDYmiBJF5KfirHnQYPm1jNLEZV4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 50688 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 50688 + LdsOffsetB_Blk: 116224 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 116224 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 16 + MacroTileA: 96 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 0 + NonTemporalC: 3 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 24 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 24 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 929 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x64_MI16Te1XcBfAkTj_Si2BZ_O928OQ0H5M89MFZVckX2T9Ct0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA3_NTB0_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120832 + LdsInitCVgprs: false + LdsNumBytes: 120832 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 930 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA3_NTB0_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 6 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16xKU2-gNLRtPrqTDeoW6Ox1NxZrNH8LrYdSR1cZojayJQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA3_NTB4_NTC7_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58112 + LdsInitCVgprs: false + LdsNumBytes: 58112 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 1] + MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 4 + NonTemporalC: 7 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 931 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA3_NTB4_NTC7_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x384x64_MI16kw4DR3bscLfN9kE6MIrDpfvGwO5OsgS18hHb5_DgRlA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x384x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB7_NTC6_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 50688 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 84480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 84480 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 384 + MacroTileA: 128 + MacroTileB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 7 + NonTemporalC: 6 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 932 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x384x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB7_NTC6_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x512x64_MI16Qzt3xLpEn8XX7WzJUeYcLb3pNOyn3wp2aHDtperEteo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x512x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC5_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 84480 + LdsInitCVgprs: false + LdsNumBytes: 84480 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 147968 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 84480 + LdsOffsetMetadata_Blk: 147968 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 512 + MacroTileA: 128 + MacroTileB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 7 + NonTemporalC: 5 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 933 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x512x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC5_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT144x448x64_MI16JmldHSP3XTQNWsmOT-Ia1CQdNXcim5Rvy9qZPhyc0ZQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT144x448x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT9_7_MO40_NTn1_NTA2_NTB7_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 159744 + LdsInitCVgprs: false + LdsNumBytes: 159744 + LdsNumElementsAlignedA: 20736 + LdsNumElementsAlignedB: 59136 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 79872 + LdsOffsetB: 20736 + LdsOffsetB_Blk: 100608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 20736 + LdsOffsetMetadata_Blk: 100608 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [9, 7] + MIWaveTileA: 9 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 144 + MacroTile1: 448 + MacroTileA: 144 + MacroTileB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 7 + NonTemporalC: 1 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 252 + NumGlobalWriteVectorsPerThread: 252 + NumLoadsA: 18 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 18 + NumLoadsPerpendicularB: 14 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 934 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT144x448x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT9_7_MO40_NTn1_NTA2_NTB7_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 36 + ThreadTile1: 7 + ThreadTileA: 36 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI1647OpdIdLsPjkx5JGNkLWyaZgGxs4t4kA19t3ZHCMmwc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB1_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 935 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB1_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI16fQVQR5gcnScAmEP6ehXKtAM_-znIoCb6q9SZSpafuYI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB3_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59392 + LdsInitCVgprs: false + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 936 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB3_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS64_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 64 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16uPL4WiDVSfjrZh1Qou0vhKt44u-gQhgLc59de4dj6Ys= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB2_NTC6_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 937 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB2_NTC6_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16RHQsbTBHWgnm-nubTl7uKOe4m19tjMDivayCq2jENWg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA1_NTB3_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 134144 + LdsInitCVgprs: false + LdsNumBytes: 134144 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 32256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67072 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 101888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 101888 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 938 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA1_NTB3_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16nJXuNiaEvLyPIwe49PAbt3uaW6BX0758KZ4tlFZYNy0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB1_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 939 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB1_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI162HlKNw6yd-3bjBU1NA3fCbhfgdU76LtwrqYq5hCFoYE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 940 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16LLVRSYLO0ehe7plK3eqGuv5AKPIUn4lEFOgWK2IcQaI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_12_MO40_NTn1_NTA0_NTB0_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 110080 + LdsInitCVgprs: false + LdsNumBytes: 110080 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 26112 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 83968 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 83968 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 12] + MIWaveTileA: 2 + MIWaveTileB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 941 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_12_MO40_NTn1_NTA0_NTB0_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 12 + ThreadTileA: 8 + ThreadTileB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI16dc1RWq5OI8ZYITl452_XtceKzZW8LbRsM-orvD1QNOk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB5_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 5 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 942 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB5_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16OPWJ2Vhbc174MGrC2nliEfLa8lucgLmzdcw90jlLzRM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA2_NTB7_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122368 + LdsInitCVgprs: false + LdsNumBytes: 122368 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 7 + NonTemporalC: 0 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 8 + NumLoadsB: 20 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 20 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 943 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA2_NTB7_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16x7VuXIvkVBdaZXVrBpv-ptK0ippKvTvEO8ANryNDeJlc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB2_NTC5_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 + LdsInitCVgprs: false + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 16 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 944 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB2_NTC5_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 7 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x96x128_MI16EwRs6OAL3-4d8o-I461H8f5l3jKJLNSfGEWNZTEh1Rk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x96x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA3_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 162816 + LdsInitCVgprs: false + LdsNumBytes: 162816 + LdsNumElementsAlignedA: 55296 + LdsNumElementsAlignedB: 26112 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 81408 + LdsOffsetB: 55296 + LdsOffsetB_Blk: 136704 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 136704 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [3, 6] + MIWaveTileA: 3 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 96 + MacroTileA: 192 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 12 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 945 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x96x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA3_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 6 + ThreadTileA: 12 + ThreadTileB: 6 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI32hKcHKQ8o49ASNY6zmjNreC_Q05A1UJzz-nd9LNa076Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC2_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 2 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 946 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC2_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI32uyddi-_Ova4MSaGan0jTebbhqveeajquDhTi1TkMuZk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA1_NTB3_NTC4_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 134144 + LdsInitCVgprs: false + LdsNumBytes: 134144 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 32256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67072 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 101888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 101888 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 7] + MIWaveTileA: 2 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 112 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 947 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA1_NTB3_NTC4_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM48_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x128_MI16fX1QLDzfCT3eH0e3dBdSNXN2QL17b_7okYOLyAb875k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB5_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 5 + NonTemporalC: 3 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 948 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB5_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16qqehBsEZr-PXGp_8_xa2MYaLCdnLBRh2rL6ItyCL5Xg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA3_NTB5_NTC3_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67072 + LdsInitCVgprs: false + LdsNumBytes: 67072 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 32256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 165888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67072 + LdsOffsetMetadata_Blk: 165888 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 5 + NonTemporalC: 3 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 949 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA3_NTB5_NTC3_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x80x128_MI16CZh-cb17lQfltxiK-93MVQ0BdXoCi3XDe8XUoqTGd_Y= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_5_MO40_NTn1_NTA2_NTB2_NTC1_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 950 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_5_MO40_NTn1_NTA2_NTB2_NTC1_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16tIqhwvntux5qIxAZw2mfP-BUQi-NomTke_pd_eoxJcY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA1_NTB2_NTC3_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100864 + LdsInitCVgprs: false + LdsNumBytes: 100864 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 83968 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 83968 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 3 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 951 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA1_NTB2_NTC3_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16Sr0lzEjPFUwdbm92eaiotYuMYY1y2wzlUPpDAwL44-o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB5_NTC3_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67072 + LdsInitCVgprs: false + LdsNumBytes: 67072 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 32256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 165888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67072 + LdsOffsetMetadata_Blk: 165888 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 5 + NonTemporalC: 3 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 952 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB5_NTC3_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16AslD6pXW7JSF6dZ1qn1k9X58w5Ll3Av-KWUeReqv7ks= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB0_NTC6_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104064 + LdsInitCVgprs: false + LdsNumBytes: 104064 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 16 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 953 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB0_NTC6_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI166Fyl5G2lYgtbZ9K5E4W-XtjRtkkUMRv8UUUTyMyvF8I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC5_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 954 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC5_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16n0bQkRZ6qxeoGAdUu7GJQfmJ5xS0chekr0sv0pJVQ6o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC4_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 4 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 955 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC4_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16hp2ricknUsKMxThnp_RJuqI1ZmNgVw5ACFCO95WQUN4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 956 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI32alykEUyYVzzxaIZSLyvFAAE0grgJ-xPZkdbVRFEkzx0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB1_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51712 + LdsInitCVgprs: false + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 1] + MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 957 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB1_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 128 + ThreadTile1: 1 + ThreadTileA: 128 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x64_MI32x3izqHgbM-d6Up_yu3myD76PCC4UCCwwHTLbQ9b8QsUu0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB2_NTC1_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49792 + LdsInitCVgprs: false + LdsNumBytes: 49792 + LdsNumElementsAlignedA: 8320 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8320 + LdsOffsetB_Blk: 41088 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8320 + LdsOffsetMetadata_Blk: 41088 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 958 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB2_NTC1_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x128x64_MI32xpjCEuAtTyUsr6Zedfsbzg9qjaO-ZRBdlA5Jm10I4ZC8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58112 + LdsInitCVgprs: false + LdsNumBytes: 58112 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 16640 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 959 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI32kZnI0mYofuEpwrNzqvlU5UpA03orT8bQpcZIcGeQYGc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 960 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI161IkOm794th9xtmf-mkwBuO231HubzKHTyLu7Tu4WjhE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA0_NTB1_NTC2_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 103552 + LdsInitCVgprs: false + LdsNumBytes: 103552 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 2 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 961 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA0_NTB1_NTC2_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI1kAtWydlVVtA2mSO1ULYhEPtiBvOdTizaO9yfe5372HA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB1_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 962 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB1_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x128_MI1d9s27EZuaxdvWTFGKd32hMyof8qLlVr_EIHGJTWt2gU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA3_NTB0_NTC0_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 159744 + LdsInitCVgprs: false + LdsNumBytes: 159744 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 79872 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 113664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 113664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 963 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA3_NTB0_NTC0_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI166fWKovV9hpGO4qaYPlsZA2HxVyg0NJO0rymRtakR8pY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB3_NTC6_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 964 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB3_NTC6_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x80x128_MI167pMAm5zbyGdER3KFEA27FXZCXlMviGTf4r_9_1FgSjc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB2_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 965 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB2_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI32NBLTHx_Gpoq8atMJtHhkIJxDKCjmWWFkjAdWqZEmCUQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC6_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 + LdsInitCVgprs: false + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 966 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC6_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x128_MI331VPimmorTQ8AYzGf1NiwfbagqGmyo8CLAfBEjaqiyM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA2_NTB0_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 43520 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 113152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 113152 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 5] + MIWaveTileA: 1 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 967 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA2_NTB0_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16p5RxYLrKEnbA-HIE4lUtKZzZTxM_IdhaDXFJXp6hszs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB0_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 103552 + LdsInitCVgprs: false + LdsNumBytes: 103552 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 968 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB0_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI16ati3xugoO08PmQml2qZEayshFx1-5FgxjyXEFaYYolM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB1_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 969 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB1_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI1rflT5zVO8AZ_i0y-GDDJCkoXv07gCcOniLc9DpvSd_4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA2_NTB3_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 970 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA2_NTB3_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI1YXqSvjevjQLvpyt_2-23TZqpzBG81diWO1-fnvsF4p8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB7_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 7 + NonTemporalC: 5 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 971 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB7_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI3u2tvE22ubVlGLexgnY5yHusorOdC_u5-aXgigV89i1c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB5_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 5 + NonTemporalC: 4 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 972 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB5_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16mzQgXvkDqVGeE12hDFqEfYTgFxPQD8C0vOou3AMGIiE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA2_NTB7_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 7 + NonTemporalC: 4 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 973 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA2_NTB7_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16f748OZlQOumtIueQrxHYmlXNlyUnnD4XzYf2Bm5yU4k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA2_NTB7_NTC6_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 7 + NonTemporalC: 6 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 974 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA2_NTB7_NTC6_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI169TqeNJ3f-dh_Q0utTQqpI7QgdML5AZVpV3NYhEbNG5A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB4_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 6 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 975 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB4_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x288x64_MI16gBjiLcqzSdUOIqOP-6Lctg5OStx9zfcjJsmowsuRFdc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x288x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA1_NTB6_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 129024 + LdsInitCVgprs: false + LdsNumBytes: 129024 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 9] + MIWaveTileA: 4 + MIWaveTileB: 9 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 288 + MacroTileA: 128 + MacroTileB: 288 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 6 + NonTemporalC: 4 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 4 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 9 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 976 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x288x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA1_NTB6_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 9 + ThreadTileA: 16 + ThreadTileB: 9 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16N7jXF8_Qf0aNxA7L42Afhh3P668h0pApiapyDg0ijN8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB2_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104064 + LdsInitCVgprs: false + LdsNumBytes: 104064 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 16 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 977 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB2_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x128_MI1gdbabCK4Cbk9dzcNz5MESH6Gr-pKALrY10oVBuGVbMQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 159744 + LdsInitCVgprs: false + LdsNumBytes: 159744 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 79872 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 113664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 113664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 978 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x128_MI1A0XIwQXFu-OwTM6Pph_d6vN_vE-SazquoaubqUZ6nx4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA0_NTB4_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 159744 + LdsInitCVgprs: false + LdsNumBytes: 159744 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 79872 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 113664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 113664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 6 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 979 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA0_NTB4_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16ciosobcQH4ypKgT0_et38BzWL178BbBE8UBJCt-UCSc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA3_NTB7_NTC4_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 7 + NonTemporalC: 4 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 980 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA3_NTB7_NTC4_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16EtL1yGW5yKYdxiu0Uq_Vhn2S_4vGR20I70CKzgpDFiI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB4_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57856 + LdsInitCVgprs: false + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 57856 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 5 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 981 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB4_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16VHPGU0mrqXBy8pYIryFrS1YaLKj2KVsHrwljqR3OZG4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA3_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 4 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 982 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA3_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x96x64_MI16xgH-uNasQNnFkPd_mYchLnJZB38K2VRlEuBoijC6t7hQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA1_NTB4_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 112000 + LdsInitCVgprs: false + LdsNumBytes: 112000 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 12672 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 4 + NonTemporalC: 7 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 983 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA1_NTB4_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16C3wndprfgaJVK5dROkGkV3eLKOnVhaJmzYWboVPNjAk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA1_NTB2_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 134144 + LdsInitCVgprs: false + LdsNumBytes: 134144 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 32256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67072 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 101888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 101888 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 984 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA1_NTB2_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x224x64_MI16IHgIlbUBAh-oojhpF9EcJ-8MdIl6f-EF1K5BOtbiZv8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA2_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 112000 + LdsInitCVgprs: false + LdsNumBytes: 112000 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 224 + MacroTileA: 128 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 7 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 4 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 985 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA2_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x224x64_MI16WEGE0_8tn9YpH6Yj31JWn7UWKY9tJMWNlzgHut1qGrs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA2_NTB2_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 112512 + LdsInitCVgprs: false + LdsNumBytes: 112512 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 224 + MacroTileA: 128 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 16 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 986 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA2_NTB2_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16TOtr1qGMLBNQmRW-tQKRjnkljLO4s0f7ail8-SeQfNE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA3_NTB7_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 7 + NonTemporalC: 6 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 987 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA3_NTB7_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16fJg9VMUMnDvFlaadft4fFJBR-AZ1KOiQ3dSFORyhhu8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB5_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 5 + NonTemporalC: 4 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 988 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB5_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16ElsNwi58ewsT3E5tO-d6OS5JheuzUEoDPOeBnRLE0Sk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB5_NTC0_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 5 + NonTemporalC: 0 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 989 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB5_NTC0_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16_gkn1MbHNokE7gxDIgmnU285d2stkvVABZGxoxsmzsw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB6_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117248 + LdsInitCVgprs: false + LdsNumBytes: 117248 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 4 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 32 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 990 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB6_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16krsgrbs75pydUq9sNfBGF5cwoLvZWAZSo8LLuYaH2g8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA1_NTB7_NTC1_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117248 + LdsInitCVgprs: false + LdsNumBytes: 117248 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 7 + NonTemporalC: 1 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 32 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 991 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA1_NTB7_NTC1_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16Z4mMoWpvvYfd28olgkYANQt7aA3dTeeCxqYS0kr75oE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA3_NTB5_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 5 + NonTemporalC: 6 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 992 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA3_NTB5_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI32nmGZ2oVA-8Jn23FdQrljhHKbRG58SGmQzbu9CyyS8NU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB4_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 4 + NonTemporalC: 5 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 993 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB4_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI1695c3DsOHZbdpyygb9R_j_oeZk16dsu5191x0V4IRQUI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB1_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 4 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 994 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB1_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16ACeXLyHEcgiJAoxHPQR1J10_cctGQw9JjXi0jAkoAlA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB5_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 5 + NonTemporalC: 5 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 995 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB5_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16I6pIOdQfdhrAh_Ct90BPYyv2Ys5r9nVOqCfHw4PmURE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB4_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 4 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 996 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB4_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI160Y1ykUqDNlgNI3WZEFLn1XS5x0MQd0hB7oTscWM7PYQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB5_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 5 + NonTemporalC: 0 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 997 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB5_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI32I_Z0J9kV32gwjW-5P043f5A9XM-6LlLmeEeRzLC3LME= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB6_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117248 + LdsInitCVgprs: false + LdsNumBytes: 117248 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 4 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 998 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB6_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM4_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI32nFZoUwk3FHsU8Iz8ivk3xvSJGQDd0z6LBp721gWwwi4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB7_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117248 + LdsInitCVgprs: false + LdsNumBytes: 117248 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 7 + NonTemporalC: 0 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 999 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB7_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16pVlEkTrKmft7dNlMWD0Ftam1zDhIu_5PcdHIezGiENE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB6_NTC1_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 6 + NonTemporalC: 1 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1000 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB6_NTC1_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16H7i9rgQAY-Hmn-PxHxFHj01qxvcZOQojUwk0rK5auCM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB7_NTC5_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 7 + NonTemporalC: 5 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1001 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB7_NTC5_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16At5zYKocGNc0tC84h2Ca8-f8RPzN85vXJiXN0gxmcHI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB6_NTC2_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 2 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1002 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB6_NTC2_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16svLLW3Wp5iCzbCDvMSBg--Q7EUrsgPyAVe_YYOHVtq8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB5_NTC3_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 5 + NonTemporalC: 3 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1003 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB5_NTC3_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16BqfWJb9IrydrMmIImYE7syAC7pjucPL3a3-FFE89DxI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1004 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16YT_go3YPrH3Z0c4g5-iW9-4P9cNreauSDieOyjiGT2o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB7_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67072 + LdsInitCVgprs: false + LdsNumBytes: 67072 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 32256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 165888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67072 + LdsOffsetMetadata_Blk: 165888 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 7 + NonTemporalC: 7 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1005 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB7_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16EYc6YLLTfTALKcv_uSgC9T3VVBS6RDQti9l8AqlphA8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB5_NTC0_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 5 + NonTemporalC: 0 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1006 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB5_NTC0_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16cSpaXzVJf7-fhy2HNPfwWWcmfyqd6n6NBhQOfp3rQME= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB6_NTC6_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 6 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1007 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB6_NTC6_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16g2ho-xQD4baNle8vkD22b-jgLJQuXr4IH_t5S3ZpwXY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA1_NTB6_NTC3_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 16] + MIWaveTileA: 4 + MIWaveTileB: 16 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 6 + NonTemporalC: 3 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1008 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA1_NTB6_NTC3_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16s9bUjn0IcaFSiVUlrZ9EMKD1QVEVRfZVXI2lSdtb8pY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA0_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 4] + MIWaveTileA: 16 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 5 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1009 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA0_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI161-5EWi-_enLvAVYrVHpCQaPgqL9xF0xXjVsOCQ-oouo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA2_NTB7_NTC1_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 4] + MIWaveTileA: 16 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 7 + NonTemporalC: 1 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1010 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA2_NTB7_NTC1_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM2_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16Yg3CGvJoR66gSwBI8RhiSxLWsQ5RxFKb1LbZoBv6LyE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB4_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 4 + NonTemporalC: 3 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1011 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB4_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16VNB3XaOhh2-NR1XK6ZRAuPEEeUPSbQp9a8A9aTqcUwM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA1_NTB4_NTC1_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 16] + MIWaveTileA: 4 + MIWaveTileB: 16 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 4 + NonTemporalC: 1 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1012 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA1_NTB4_NTC1_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16SSiNEW16nYcsffeycDFQkvui5kpDWe-X_zn_bhPr9ig= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_12_MO40_NTn1_NTA2_NTB6_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 60928 + LdsInitCVgprs: false + LdsNumBytes: 60928 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 26112 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 60928 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 12] + MIWaveTileA: 4 + MIWaveTileB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 6 + NonTemporalC: 2 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1013 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_12_MO40_NTn1_NTA2_NTB6_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 12 + ThreadTileA: 16 + ThreadTileB: 12 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16mYzq1ZgGOir3WNse9Qh17J43TFE_95PscgKGYfll0hE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 3] + MIWaveTileA: 16 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 7 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1014 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16dfcXFDA8SLUX-vrGy7rbc8Q4fMWlcRwN_wfQyscMJfU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB7_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 7 + NonTemporalC: 5 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1015 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB7_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16zYydvC1cxtIV2-tn0CB4-WHOsq99l20ygGB_zv7hJIo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB7_NTC1_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 3] + MIWaveTileA: 16 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 7 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1016 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB7_NTC1_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16MW9U_2TPrZeYH5ppNd7JzZ-alNEiIAH_s3i0V-M_-3E= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA1_NTB6_NTC3_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67072 + LdsInitCVgprs: false + LdsNumBytes: 67072 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 32256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 165888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67072 + LdsOffsetMetadata_Blk: 165888 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 6 + NonTemporalC: 3 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1017 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA1_NTB6_NTC3_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16ns-fA46zepLUpH1jert-cTWxNUIJah040YS1-eGNBds= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB1_NTC6_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1018 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB1_NTC6_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16Rha03v4MhMNb6jRDsDBiP1h1rTlTKthb7HiWAG9Arb0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1019 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16Tdz_yQM1ZwK4kR7vHKsmLRZdrnCpmmXtZDr7tmUSq8A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA4_NTB0_NTC7_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104064 + LdsInitCVgprs: false + LdsNumBytes: 104064 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 7 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 16 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1020 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA4_NTB0_NTC7_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI3dLSbUx9BEOTzqeT_H8ocDARlxEx7Dgff2ylxd52Ls_M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA3_NTB3_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 136192 + LdsInitCVgprs: false + LdsNumBytes: 136192 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 68096 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1021 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA3_NTB3_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 1 + ThreadTileA: 64 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16ZgNaKnH_H8ts65iOQcCcEIRBXR3CLNqivzkc2_gpqqA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB1_NTC0_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104064 + LdsInitCVgprs: false + LdsNumBytes: 104064 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 16 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1022 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB1_NTC0_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI161MgVG7V2_jRTY_t_zmbX6GCxZrZN-OQf9f4PZ93N0FY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA2_NTB4_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 4 + NonTemporalC: 5 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1023 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA2_NTB4_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x128_MI1a9La1t4bkGflOu9rs7esrQg1R2kdM8mOtN2cokKsxxM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB2_NTC5_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 159744 + LdsInitCVgprs: false + LdsNumBytes: 159744 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 79872 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 113664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 113664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1024 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB2_NTC5_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI165Ph5FzTBInU7wlUtI1XvBDV5Vo5H7CclCV448dKEBjk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA3_NTB3_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 110592 + LdsInitCVgprs: false + LdsNumBytes: 110592 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1025 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA3_NTB3_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16nHFuOFllFsGK0KFewYZ4htEdbnVYLgfLbmrPe_PQ2wE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB3_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 3 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1026 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB3_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16EFOjhm0P5myZD2ZUMYoJl8PTasXYyawjrjWCNZamLSk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA3_NTB0_NTC6_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 110592 + LdsInitCVgprs: false + LdsNumBytes: 110592 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1027 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA3_NTB0_NTC6_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x128_MI1tnbzwD61gJ8-eEJNYtMNbLQ5OG5mlyk0onu9l1wLgKs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA1_NTB1_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 159744 + LdsInitCVgprs: false + LdsNumBytes: 159744 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 79872 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 113664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 113664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1028 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA1_NTB1_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x288x64_MI16b-0hvop7nM8pniQgukyTovYnCjkagpUYftIy3lyOPEg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x288x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA2_NTB3_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 129024 + LdsInitCVgprs: false + LdsNumBytes: 129024 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 9] + MIWaveTileA: 4 + MIWaveTileB: 9 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 288 + MacroTileA: 128 + MacroTileB: 288 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 4 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 9 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1029 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x288x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA2_NTB3_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 9 + ThreadTileA: 16 + ThreadTileB: 9 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x128_MI1LuOxONfMHSlq_UjUT9xBppI-AnllztoOTZImOhMcZQQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB0_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 159744 + LdsInitCVgprs: false + LdsNumBytes: 159744 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 79872 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 113664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 113664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1030 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA2_NTB0_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16u2x9kOu0hKWVZugV0AqUr0nPgwFHsYz3EYunVxdL5Qc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA3_NTB3_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1031 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA3_NTB3_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16wcWGExQ1bTDNkVseRcVXig4uUydDs-9d2wRriz03UdY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB1_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1032 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB1_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16NZL-75Ki7XAsNLaTZihU08lWBBhRtgHjnaUOLckBuQI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB3_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1033 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB3_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x256x64_MI16AyKb6vtbmqKKHAvrJyFKhPUETr4Ol8uWmDuvH1-1B3c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA1_NTB3_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1034 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA1_NTB3_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16kDiDPfYefDawU7pB4UeOMRV40AcW7hmQKVxJkDN9xHg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB0_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1035 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB0_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x256x64_MI165tbp6mPoSU2BMK3wSMdm_VkVQsYaVhsiCr3oS3quvJg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB5_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 5 + NonTemporalC: 6 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1036 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB5_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI160kDl7yaU9gYvW2vW2lN_qXwOqy2Lr0Qto6vNjcx7UNA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA2_NTB1_NTC6_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1037 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA2_NTB1_NTC6_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT160x256x64_MI16XoNfbiF6htIEwHHGVe6564vtK9vzaVwuBM8Vr_ZZsEg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA1_NTB2_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 21120 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 21120 + LdsOffsetB_Blk: 86656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 21120 + LdsOffsetMetadata_Blk: 86656 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 8] + MIWaveTileA: 5 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1038 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA1_NTB2_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 8 + ThreadTileA: 20 + ThreadTileB: 8 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x320x64_MI16Sg-N8yW_Z1bcX3qC7j2-LFytSJN-Mus_91wMBA5vIK8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x320x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB0_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 42240 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 320 + MacroTileA: 128 + MacroTileB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1039 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x320x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB0_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16zEfK9g7EM94jFa32eAXtrbKTIojORHQ115zy-i3xxP8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB5_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 5 + NonTemporalC: 7 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1040 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB5_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16H9YoRVYXK40Eq2iR_4OzhYcMfwzyeTxbpmZETsE2fiU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB4_NTC7_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 7 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1041 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB4_NTC7_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI329SlG48-ea0-kImal4lo2_OisWj-a-6M71h-ef_F18f8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB1_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 2 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1042 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB1_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x320x64_MI16_qIxqjmEkQ0ngAkEKOzmMX3i6qKwWQILcBHveDPCXh4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x320x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA3_NTB6_NTC5_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 42240 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 320 + MacroTileA: 128 + MacroTileB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 6 + NonTemporalC: 5 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1043 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x320x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA3_NTB6_NTC5_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM32_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16QK03hBnQ78u4OPWFbqStxrwJ2M-jC8ammqO3Ik2XybI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB1_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1044 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB1_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16_6rUGns54P4iRm9x7g6oqi5vQPI36h5UtnBDtmSe6sk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB1_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 3] + MIWaveTileA: 16 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 7 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1045 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB1_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16CnaJJAsUKAIiiJ70KHC-E7xFWh61mD9O3Q_9gf-Qxho= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 7 + NonTemporalC: 4 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1046 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI32lsdl6vHUJOLzJWQ0WkBk9t0H-hM_F69mI50m3O7qq1s= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB3_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1047 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB3_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x224x64_MI16NualOvuZ9pPE0JQcqfMNqi8otcaKx69vcKn-FKPXHr0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA1_NTB1_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53248 + LdsInitCVgprs: false + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 224 + MacroTileA: 128 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 7 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 4 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1048 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA1_NTB1_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16RGn038FBreYZmQWbgae1UhiFktt_aCyc_OZlbzMDxPA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB2_NTC4_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 4 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1049 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB2_NTC4_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI32cIpEnTcE84fZ9PRiEIWKoRIX7eCuPz1mtyvvWg3c7h4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 134144 + LdsInitCVgprs: false + LdsNumBytes: 134144 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 32256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67072 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 101888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 101888 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 7] + MIWaveTileA: 2 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 112 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1050 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16HOObusf0o8NdAHXuyBZqLdkOGGwuJvOmTtBo1MNJUfM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA2_NTB4_NTC4_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 4] + MIWaveTileA: 16 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 4 + NonTemporalC: 4 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1051 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA2_NTB4_NTC4_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16UQbNvSEceN_LPCaQJTQJhZ53iF7mHC2TxEbC9JAyDt0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA1_NTB7_NTC7_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67072 + LdsInitCVgprs: false + LdsNumBytes: 67072 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 32256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 165888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67072 + LdsOffsetMetadata_Blk: 165888 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 7 + NonTemporalC: 7 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1052 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA1_NTB7_NTC7_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16ZhBJaC9kNuzX0DxuvPiBUyTs7czkHL2T6LAodogjRYA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1053 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16iWCfOsxx7jtqzHvFdOeuUoXxcQKvozyUXMPq0hRBXDc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA0_NTB4_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 4] + MIWaveTileA: 16 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 5 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1054 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA0_NTB4_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16AhdIgk54mOOWxYUqzAf8U4VvuDScAwy08q1ER8wpVlw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA0_NTB7_NTC6_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 4] + MIWaveTileA: 16 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 7 + NonTemporalC: 6 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1055 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA0_NTB7_NTC6_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16hihObv1T3WmVXc1uIYjLBbI4NSrEq-9PuS1oNbN0anE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1056 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI166XBnTeKzS7DngXrqKvV7A1jTQSRqAxsgnUZz0ZC8Zao= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB3_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1057 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB3_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI168a6vgjEtsTCl4Q0XdqE3Z5kva194GGS2SzXNm3r2lDU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA0_NTB1_NTC6_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 3] + MIWaveTileA: 16 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1058 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA0_NTB1_NTC6_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16BBtPd9TAI-SMRvzVY79RSiFfiE4QRWzXKKDby2jqwPY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB7_NTC6_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 7 + NonTemporalC: 6 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1059 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB7_NTC6_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16xJEMM6LYN_e7vy_zaD7OGKbATqZz6KdhiH7HCam2VSA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB0_NTC4_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1060 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA2_NTB0_NTC4_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16PEQAv9oY8qLEOVjBz5ssRS3aC7zKbq9XTXNSRQeir2w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1061 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16Esi0nYSe8fXG-4BxVUxHKaH00tJb4onna0VjpYacw3M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB4_NTC1_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 1 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1062 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB4_NTC1_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI32g92QdEFf4ASBLkEsH7Rj7VWoIOAPO8etK13WZFJvB78= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA2_NTB0_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 134144 + LdsInitCVgprs: false + LdsNumBytes: 134144 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 32256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67072 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 101888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 101888 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 7] + MIWaveTileA: 2 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 7 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 112 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1063 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA2_NTB0_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM2_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16zsJEWcUKOyQX_UOIXMCVIagPvbo_-4ymbSHjswClW6k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 4 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1064 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16b6MQoTrxALPfBX4ESIm92NC5jFvjc65EJc65MuEttrs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB6_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 6 + NonTemporalC: 5 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1065 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB6_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16TdfBACs6vIRfpCjyT5NiH1OqsVAptBCAQGMhqI1aFnE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 3 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1066 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16uFIvoH8xDmCn5AEIJq75hujEFC54w18bP1YCDLkk7yY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB4_NTC1_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 4 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1067 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB4_NTC1_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16jZqm1c-l_pGzsGAznryzAG7OFgWf1w_WabnvsRGS8Eo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB4_NTC2_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 4 + NonTemporalC: 2 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1068 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB4_NTC2_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16Q2AdOK2nfivw4jm-kfj0Y7kpDhfNv-eHWF7SbAyqMdw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB5_NTC7_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 5 + NonTemporalC: 7 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1069 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB5_NTC7_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16WO_NN55n0LZqDtG-_tiQnWxk6RZWoiA2W9I_K8OTt_0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB5_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 3] + MIWaveTileA: 16 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 5 + NonTemporalC: 7 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1070 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA1_NTB5_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM2_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16XWr7BISLqeiDp9croxxirvcAI7gaRXrmdV3Cu_FLdhY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1071 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI1695FkNHoDkjkfnLW2UzrBc6PvDDhFWtwrdew-CYY_QTY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 4 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1072 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI164GutLKCPh4OjvM7H_qT6XkIdkLDGdP3ftdLqyGFK_k4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC4_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 4 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1073 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC4_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16FAFIsm1iirmXv8ZP5wQgJTKxvUZLK-9T7gOlNRM6NtA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1074 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16K1f1is7knNvgs67ILLXNPZu6zVt7l0vnJtQTi4RuXYQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 3 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1075 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC3_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x256_MI16xdUUz19f8D8MI2nFIrQGpCuWZJFaM7Rd5Vs9SEMSWdgo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB2_NTC0_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1076 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB2_NTC0_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI13iKeSOb99hXcjX3CIHq3lBqvIRZskJ-O4olkIG0b1L0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB1_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 1 + NonTemporalC: 2 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1077 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB1_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x256x64_MI32-XT0Fb1H5FkJHG0p-F7E7Qil8yeb2hIr7_xWyvwW3Vs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA6_NTB3_NTC7_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116736 + LdsInitCVgprs: false + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1078 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA6_NTB3_NTC7_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x128_MI1qJ4AGc7nfkYrKtLmZgjXBzS3UEN4pDuMCmYTo2zCxOs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA5_NTB3_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 159744 + LdsInitCVgprs: false + LdsNumBytes: 159744 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 79872 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 113664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 113664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1079 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA5_NTB3_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16SA-uhqgOr4Qh_kmIwcpTxdft7wtwadVjh8kqXncwQVc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA5_NTB3_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1080 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA5_NTB3_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x288x64_MI16i-6EXNthkeJDUJNmvbETLZtaN1U_svNblisIqN6BwSg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x288x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA3_NTB0_NTC6_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 129024 + LdsInitCVgprs: false + LdsNumBytes: 129024 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 9] + MIWaveTileA: 4 + MIWaveTileB: 9 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 288 + MacroTileA: 128 + MacroTileB: 288 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 4 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 9 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1081 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x288x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA3_NTB0_NTC6_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 9 + ThreadTileA: 16 + ThreadTileB: 9 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16Gt4L7W9lspFcka5wdZU0tkh4HDzGsjFwWgh-t_nB-JA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA6_NTB0_NTC4_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1082 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA6_NTB0_NTC4_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI320KPGFal3u58tw6FPWrPvcLpz5orX1K_AW2hzLezkSqo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 4 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1083 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI167wQVo-YG41mG6WK19J3LCEjVD16RtxByf3rSRmg9fsk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_12_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 26112 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 12] + MIWaveTileA: 4 + MIWaveTileB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1084 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_12_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 12 + ThreadTileA: 16 + ThreadTileB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x288x64_MI164FDAqPkKkvz8Iv-fvZ_6L0AAGrPLIGAMkSlBsAOeRiU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x288x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA2_NTB2_NTC7_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63488 + LdsInitCVgprs: false + LdsNumBytes: 63488 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 63488 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 9] + MIWaveTileA: 4 + MIWaveTileB: 9 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 288 + MacroTileA: 128 + MacroTileB: 288 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 7 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 4 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 9 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1085 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x288x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA2_NTB2_NTC7_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 9 + ThreadTileA: 16 + ThreadTileB: 9 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT160x256x64_MI16OqJGd14uQ52gRI1Rsi2UFdPxAm0swwNQHRLjTjfiTSI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 21120 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 21120 + LdsOffsetB_Blk: 86656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 21120 + LdsOffsetMetadata_Blk: 86656 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 8] + MIWaveTileA: 5 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1086 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 8 + ThreadTileA: 20 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16bvofrvKNNxN3cveXIgt6QtUZ0U7s3nS0eIiKeEb4xfQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB0_NTC7_NTD6_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 7 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1087 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB0_NTC7_NTD6_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16aZk9zeuXQ7r7-Vw6xPDTlAKzJlBebYV6m41NfLjjPYc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB5_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 3] + MIWaveTileA: 16 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 5 + NonTemporalC: 4 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1088 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB5_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16fO1oASOze0Vg2zoh5hEn16uUUlBzEs4FD9alS1uM5mA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA3_NTB3_NTC5_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 4] + MIWaveTileA: 16 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1089 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA3_NTB3_NTC5_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16VJsYpVwb9QqJwaBn2Lit50HD8ymMpP4j6B0HYU4-oXQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA3_NTB0_NTC6_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 4] + MIWaveTileA: 16 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1090 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA3_NTB0_NTC6_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16LMwSxXj5zp3J9K8y9QEbSARIUTRup4D3jQD1UdgXFsI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 4] + MIWaveTileA: 16 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 4 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1091 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16bMZKzwvhzoixVP9zCi86UXG06PHHbcvgrOITFtU6sU4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB0_NTC5_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 3] + MIWaveTileA: 16 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 5 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1092 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB0_NTC5_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16bL9wrzQn5yuJSeFskoVsiFz69S8zIX7w-5J5HCkIEJY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB1_NTC7_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 3] + MIWaveTileA: 16 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 7 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1093 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB1_NTC7_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16_HwgB-5ytFjzfDU5AxTK1_7uAqjnRN6UbRkkfxeUqC4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB1_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1094 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB1_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16UPIGRlzE5I7x3ynQYJ-3nh8vEaG7Z582rxLtzwCT5D8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB2_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1095 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB2_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16zkh8X4QQrZTrsCN4WQ7GTYaOnlWMCun0aSaIEMcwQDk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB1_NTC4_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 4 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1096 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB1_NTC4_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16S6KV_wdPGr0pOqwvsXLdWjTvTzS4xPZ025IRHSW0ugE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB1_NTC3_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 3 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1097 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB1_NTC3_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI166mORZ_sQZm-feWyQ7nxv8YLKPNgHVlKW-cOQfRsEkW0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB1_NTC6_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1098 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB1_NTC6_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16noqae25_3KStTyIosT05Yu2Zfuq2kaD8xwGFvRu1JVg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1099 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16C6rbpU94tjmEgJRAd6muxl4-WucqDzcWk2vOzzM1_tY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB0_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 3] + MIWaveTileA: 16 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 5 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1100 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB0_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16dWLIk0a4GB9NXjspL5nt3eYrqi1uWCOpFwVujI_G-wE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA3_NTB1_NTC2_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67072 + LdsInitCVgprs: false + LdsNumBytes: 67072 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 32256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 165888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67072 + LdsOffsetMetadata_Blk: 165888 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 2 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1101 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA3_NTB1_NTC2_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16HUANMq2EUSkR8b7huy-UXY7HAdoJNx-I6gxGbL8WJek= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1102 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16KGEFctsEFTt8vPGbTqyWYKt4uOgo3ckf_waffwCvhfk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB0_NTC6_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1103 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB0_NTC6_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16zGkhbTybq3q7R8tqcH_041xzWUOs63S0Vfv5vdHNG0E= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1104 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16fO1Vl6_RK2pneSEmSiH6889f7udGqZF5en1jPjE7f5s= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 4 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1105 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16DolEhU6B8peQS3Xr6gLMSHrNYDRRdtZG1_lDoxfVxw4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1106 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16DqXLmuBHw91--EFJGqGve91ZKrAkVU5Dgdf4oFu5USc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB0_NTC6_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1107 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA2_NTB0_NTC6_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16WCvet_bQ3nd_VHY5vJFI3xOqyx41kdfAbgxo8VKJSA0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA2_NTB2_NTC4_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 16] + MIWaveTileA: 4 + MIWaveTileB: 16 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 4 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1108 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA2_NTB2_NTC4_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16-VOP_WA2iIQOjjdqGpuPz5kr9KYtc-lRi-KptFlDH_U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB1_NTC5_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1109 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB1_NTC5_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16nT5mD0zgHNuSuNO3LS4pIiNXb5nn8h4D7ahAEPo_gk8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1110 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16CA7Yb3Smt0lByzCMKtYT8mQpKMYNjHDTKRxOeHnvET4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC3_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 3 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1111 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC3_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16LTAnBB7dnOp3mFrtckU-sVhJHXdp4-pBAGhzx-rgxUI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1112 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC5_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI167SQROjbjyojkOid5V1kDdg0N5EBU1mGMl_BEBvSu4zs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC4_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 4 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1113 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC4_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16M8EcAH0KKwfGMOFiO9vHhf8fw2_F5Q_UiRQsEOLIhPQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB1_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 3] + MIWaveTileA: 16 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1114 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB1_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16rmzN-rQbzDadClNM6Z-qm0R3b12L4e3hK6iGCsTuU1s= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB3_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 3] + MIWaveTileA: 16 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 4 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1115 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA3_NTB3_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16lz06vbyMMB-5-9k9BDFIPd3DEjSJ7iIbbmHAva1GmqM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1116 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16GWskxaMjYwjKTmkaq514txv9pulvSv_erT_VgSDE83c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC2_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 2 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1117 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC2_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16tRFlWvKSF04gQ9PZokgQUcOKDhiq0TfToeDxswShft8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 7 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1118 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16Sx6-NSD-VB2hLGWLmrGjCnG5l_q7cf0-XzZBce8HOBM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1119 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16Ybv8p_jVFypl9sWQQFDDi1cPn8IzH4pi2AefDIhZ-G8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB3_NTC5_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1120 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB3_NTC5_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16LE-teoIckV9SwtN-HF7EEfMn0L4AQgAt_pdBX1jXvj4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1121 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16MgW3z512SZg0bOgviAw73jy_ZBOr8_SGK3gOhEG8QYw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC4_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 4 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1122 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC4_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16R_XGnIyKwD_ggRKOz3uhrre8cvOR_2HFLhMFeHl_S2c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1123 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16E4BRmOTUaJeUyAaeCyhjNIjh6GpBB5szm02giyo0mxo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1124 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16ssYgjsPX3NIGr1xmyTwHB7XVv5sKgD0tDoOkgvN53SU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC7_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1125 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC7_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16MuVB9bB0zS45imskyXLOsB2x4bbsRUZI5PVWVKQwcY8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC4_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 4 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1126 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC4_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16OSkCnaTZhKg4ll1hDwDoJBMt5EdUQkVbuEM1IeK16cU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA2_NTB1_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 4] + MIWaveTileA: 16 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1127 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA2_NTB1_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM32_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16J7iDAyLmqVVIehsVSoYUv5fOo-XqS_59meivI4B4Kkc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC4_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 4 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1128 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC4_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16_NOb4H_Rw0hQCSPdl88-jl2JhI09qpVznCp_OeT5aAM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 4 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1129 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16bDp3iRHblovZwgU5K1M-8dA0RKxoDsyjHc1s5aDWaro= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC7_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 7 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1130 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC7_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16xUZcE0hE2Bib1NNuylLNWifCQmoKaQqEw2uEAoVv1_E= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1131 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x256_MI16xFmUG21QAMH0kUGV4BTe2JF_fY92putQfeCuWGZI7mbU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA2_NTB1_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117248 + LdsInitCVgprs: false + LdsNumBytes: 117248 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1132 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA2_NTB1_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x256_MI16xicdzBXpHaJqwqfaUVeyNltBgTlxJarQSuj2alvAGbVg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA5_NTB0_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 0 + NonTemporalC: 5 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1133 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA5_NTB0_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x128_MI16x9ijl6pmXU4AiAeoG_II5-NaljFlVEWMycwB2vm_x9AU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA6_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 110592 + LdsInitCVgprs: false + LdsNumBytes: 110592 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1134 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA6_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI16njp7CPA9beWp6S-w4r1lrpxYwCoEi-BmazO82Q-QAa0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA4_NTB0_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1135 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA4_NTB0_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 6 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x128_MI16xjbWgUiB8fBDD3qF886IbpXQNNUuYQaunGlFUAcRl9A8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA6_NTB2_NTC1_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 110592 + LdsInitCVgprs: false + LdsNumBytes: 110592 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1136 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA6_NTB2_NTC1_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 6 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI16oZJsBklcXmO9_EkFw0q2QxqKalKjsNqqSu7fqQgWQKE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA4_NTB1_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1137 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA4_NTB1_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 6 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16vdv5_0fbgLcwQ9ftVdstSpW297KX0NAT55Up98EJAPk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1138 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16wW2GdYbBAi4mXKt-RSmebm4FOMJ2nnnfaqgn_vLsIPI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC5_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1139 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB2_NTC5_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16AOWOx76QxjeIBmUHaDWCAlzYn-r3NYlZn2Aymw1nadE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1140 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16HCQocKtpDSk1PmC_MhDWpSrG3Bl8gSHYlLzp3Rvv5xs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 7 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1141 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16J-b4eg25Bv93NUcDs4Vr6UOcuDThqY3_tV7ZVspNCcU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA2_NTB0_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 4] + MIWaveTileA: 16 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1142 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA2_NTB0_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT320x192x64_MI168YC1E6XlJcpiWqvuj2SFey4Y34gcRhSdKxrluzoR1bk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA1_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 73728 + LdsInitCVgprs: false + LdsNumBytes: 73728 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 177152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 73728 + LdsOffsetMetadata_Blk: 177152 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 6] + MIWaveTileA: 10 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 192 + MacroTileA: 320 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 120 + NumLoadsA: 10 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1143 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA1_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 6 + ThreadTileA: 40 + ThreadTileB: 6 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT320x192x64_MI168YC1E6XlJcpiWqvuj2SFey4Y34gcRhSdKxrluzoR1bk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA1_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 73728 + LdsInitCVgprs: false + LdsNumBytes: 73728 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 177152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 73728 + LdsOffsetMetadata_Blk: 177152 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 6] + MIWaveTileA: 10 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 192 + MacroTileA: 320 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 120 + NumLoadsA: 10 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1144 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA1_NTB3_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 6 + ThreadTileA: 40 + ThreadTileB: 6 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16eDZsMfNwnfVMVs6i8i9BJmt4qLqSm7t3KWlOXVLZYDQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1145 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16IeIPnD0ASsH304iQErCsLeHn4UAt9Zvuggp2bi_iYtQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 7 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1146 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16CAdVKf_H9WahvROBJTW2fomY7fLU33Z2lZiHbQin9fA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC5_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1147 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC5_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16IVcnlac8U14ytqCcaTbrvhpCbfpFud3o2PBOZbeKFoQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1148 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16JBtQL-gXWn1uRsQ2oat1oL2PA04O9DaSF-Ii9Q-0ZUo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1149 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16d2N6DogEQ6hyfXHa_mSAsHyviMx3EZJLgQaxRgHFzts= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1150 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16DfVHvnACULGH37Sa_XE-P7tLcdtlwiwnOAe31qbXFmw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1151 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x32x512_MI16x1v-3zJDSL2tojKmJzevwA9SB1VS8zFaRMW3BaP-Tt8E= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1152 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM0_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x64x128_MI16xYeWN9tZpQCW9DZrfqqTK_y5abbBuQacoY_6asOJCEx0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA5_NTB2_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 110592 + LdsInitCVgprs: false + LdsNumBytes: 110592 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1153 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA5_NTB2_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x256x64_MI16yBdWkeYWY_eptRtD922RgiDQ8Dq25ci2fyY-P4ql8kg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC7_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 7 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1154 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC7_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x256x64_MI16Ms0dd4RHIG_WMoYIl5-1AwK1cEtHh_9-2asMn2GegQA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1155 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC5_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x256x64_MI16RmL8CMKs3abmUopF8c8Pj84PklcDPZ3YJimZqAQaTpo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA2_NTB3_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 16] + MIWaveTileA: 4 + MIWaveTileB: 16 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 4 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1156 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA2_NTB3_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x160x64_MI16rOVTex7lBbkCoCRJ6Kol_Pt1HKz3iQ9QWoZyuKqlAAY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB3_NTC1_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1157 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB3_NTC1_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT320x192x64_MI16JhWz8HR4Ozx_QFsCymLdPJkp0u7NGVqR0BvhUPY86Ds= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147456 + LdsInitCVgprs: false + LdsNumBytes: 147456 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73728 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 119808 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 46080 + LdsOffsetMetadata_Blk: 119808 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 6] + MIWaveTileA: 10 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 192 + MacroTileA: 320 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 4 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 120 + NumLoadsA: 40 + NumLoadsB: 24 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 40 + NumLoadsPerpendicularB: 24 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1158 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 6 + ThreadTileA: 40 + ThreadTileB: 6 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x320x64_MI16pWnMbqXR46MGZmtqGnuyZWyBgunOkRzNPXiN9mb_Qp0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 73728 + LdsInitCVgprs: false + LdsNumBytes: 73728 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 158720 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 73728 + LdsOffsetMetadata_Blk: 158720 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 10] + MIWaveTileA: 6 + MIWaveTileB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 320 + MacroTileA: 192 + MacroTileB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 120 + NumLoadsA: 6 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1159 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 10 + ThreadTileA: 24 + ThreadTileB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x256x64_MI16iUeGTsZpmRqKoDjPGylLXyFkLFuIbSmKPrCHfEfcz0I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB0_NTC7_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 7 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1160 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB0_NTC7_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16tTKs9Bobo4896Rn6pyEwLSJG7bw7MA6urE3_e30cug4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 4 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1161 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x144x128_MI16kwElsBWslWZFA4inQsiQvPTy8axcvY0uW8140a_FMGI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x144x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_9_MO40_NTn1_NTA0_NTB6_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 125440 + LdsInitCVgprs: false + LdsNumBytes: 125440 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 41472 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 83968 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 83968 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 9] + MIWaveTileA: 1 + MIWaveTileB: 9 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 144 + MacroTileA: 64 + MacroTileB: 144 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 4 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 9 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1162 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x144x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_9_MO40_NTn1_NTA0_NTB6_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 9 + ThreadTileA: 4 + ThreadTileB: 9 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT384x160x64_MI16AoZiuikCjAtI7o0V9fGYRI8_9mMMWsMRFH275KpZEi4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA0_NTB2_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 146688 + LdsInitCVgprs: false + LdsNumBytes: 146688 + LdsNumElementsAlignedA: 52224 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73344 + LdsOffsetB: 52224 + LdsOffsetB_Blk: 125568 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 52224 + LdsOffsetMetadata_Blk: 125568 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 5] + MIWaveTileA: 12 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 160 + MacroTileA: 384 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 48 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 48 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1163 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA0_NTB2_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 5 + ThreadTileA: 48 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x256x64_MI16Y8TxBBS9bifMM7IxNsFmenMisrZ33tZ5QbFR54_-kMA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 7 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1164 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x256x64_MI16LCG604UGZOHZzw2NaKdk51GP6MMxsvBUNZDLM5_GJ40= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB0_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 7 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1165 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB0_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x256_MI16xS155ys4mzM6BQj6FfprjynrI5lFJPW0s0gNUZTurBzk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB7_NTC3_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 7 + NonTemporalC: 3 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1166 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB7_NTC3_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x256x64_MI16bfHqywFO81hFhO4oluv3mQgUYD6C3wMstY5HzTDY_B4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC0_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1167 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC0_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16_sSxkSgDYcBmIClOr8spTw722SRcXTJqaExxhWVHkls= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA3_NTB1_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1168 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA3_NTB1_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x256x64_MI16ZVEcpRilhnVvMOGPEyf_fUl1b1f9WHSxv86xpI7S7Rs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 4 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1169 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16-tYhbCxlfbhtzTOPHJhzchd4BVNV8Dc6A6QDANLveMI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1170 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT384x160x64_MI16uoloKMx6I4MRgCUee7moPz0hDCd9mcaOGKOEaQdxSoM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA3_NTB3_NTC7_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 146688 + LdsInitCVgprs: false + LdsNumBytes: 146688 + LdsNumElementsAlignedA: 52224 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73344 + LdsOffsetB: 52224 + LdsOffsetB_Blk: 125568 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 52224 + LdsOffsetMetadata_Blk: 125568 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 5] + MIWaveTileA: 12 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 160 + MacroTileA: 384 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 48 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 48 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1171 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x160x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA3_NTB3_NTC7_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 5 + ThreadTileA: 48 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT320x128x64_MI32Wtc1yp6Ta3Pua26TOMtOY3BI-_7idMgKAt4367XhbLk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x128x64_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB2_NTC5_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 129024 + LdsInitCVgprs: false + LdsNumBytes: 129024 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 111616 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 46080 + LdsOffsetMetadata_Blk: 111616 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 128 + MacroTileA: 320 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLoadsA: 10 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1172 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x128x64_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB2_NTC5_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 80 + ThreadTile1: 2 + ThreadTileA: 80 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x256_MI16x-pudhQ7fGawuUXdg64U7yO42tTWWyNIUjdps9hDrwEY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA5_NTB3_NTC5_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 136192 + LdsInitCVgprs: false + LdsNumBytes: 136192 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 68096 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1173 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA5_NTB3_NTC5_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x16x256_MI16xtt07358_0YVbgycjXjUwRmkDDH_2hGoOdccC0QMrTok= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58112 + LdsInitCVgprs: false + LdsNumBytes: 58112 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1174 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM0_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 7 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x16x256_MI16x8wxC-6tg13jz0ohIoI0V3uu5KuQ5guTMvbtNKjKx7n4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB1_NTC2_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58112 + LdsInitCVgprs: false + LdsNumBytes: 58112 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 1 + NonTemporalC: 2 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1175 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB1_NTC2_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x256_MI16xjLZGvMcn-2XSfwKS5fpF6auy3gUA2UC9lLk6n6pVAGk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA7_NTB2_NTC2_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 2 + NonTemporalC: 2 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1176 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA7_NTB2_NTC2_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 7 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x16x256_MI1640zrVku2d-o_pKaiM9ZuvyIrrHuvMlJlRNXC5wSoEGM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC2_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 2 + LVCA: 32 + LVCB: 128 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 152576 + LdsInitCVgprs: false + LdsNumBytes: 152576 + LdsNumElementsAlignedA: 67584 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 76288 + LdsOffsetB: 67584 + LdsOffsetB_Blk: 143872 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 143872 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 1 + NonTemporalC: 2 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1177 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC2_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC16_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT512x16x64_MI16xiPimPr3JiV3nbFC6U_GRfFiP7RJIaLZ98adTdjtmZGI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT512x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_1_MO40_NTn1_NTA4_NTB2_NTC1_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 140288 + LdsInitCVgprs: false + LdsNumBytes: 140288 + LdsNumElementsAlignedA: 67584 + LdsNumElementsAlignedB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 70144 + LdsOffsetB: 67584 + LdsOffsetB_Blk: 137728 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 137728 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [8, 1] + MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 16 + MacroTileA: 512 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1178 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT512x16x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_1_MO40_NTn1_NTA4_NTB2_NTC1_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x64x64_MI16xzFmIialZJsNxItzr-zDgqFXT8i58bv2y0xoQaYdYieE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_2_MO40_NTn1_NTA7_NTB3_NTC0_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 107776 + LdsInitCVgprs: false + LdsNumBytes: 107776 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1179 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_2_MO40_NTn1_NTA7_NTB3_NTC0_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16ofycSbuK0LYxPwEekhAMKIySnqQvNtHf2PVhYPHxNvE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA7_NTB3_NTC0_NTD7_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1180 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA7_NTB3_NTC0_NTD7_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 6 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x32x64_MI16xGnajFGSc5v9ghEzUrtjydyVsFac202K1eboUzLYoOSE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA6_NTB2_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 103552 + LdsInitCVgprs: false + LdsNumBytes: 103552 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1181 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x32x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA6_NTB2_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM2_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT512x128x64_MI16yaHHigmnL0zCRVkhyrhCzJpRof5vB7YpVpjVCZGmwvM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT512x128x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA6_NTB3_NTC1_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 84480 + LdsInitCVgprs: false + LdsNumBytes: 84480 + LdsNumElementsAlignedA: 67584 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 67584 + LdsOffsetB_Blk: 198656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 84480 + LdsOffsetMetadata_Blk: 198656 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 128 + MacroTileA: 512 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1182 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT512x128x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA6_NTB3_NTC1_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 7 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16Ur1U9ffXVZ7Ka1EXunwGDeUcDnI34yHDXxxeoAdl07o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA5_NTB5_NTC1_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 5 + NonTemporalC: 1 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1183 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA5_NTB5_NTC1_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT512x128x64_MI16V3PL-B3Ing9LzaWSXrtfBGIK2o1DGNuMPKZg-VT7jX0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT512x128x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA4_NTB4_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 84480 + LdsInitCVgprs: false + LdsNumBytes: 84480 + LdsNumElementsAlignedA: 67584 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 67584 + LdsOffsetB_Blk: 198656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 84480 + LdsOffsetMetadata_Blk: 198656 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 128 + MacroTileA: 512 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1184 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT512x128x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA4_NTB4_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 7 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16K7EkM19lOgktmwEAw6FVkh7gDmk2zkq4B3cY0qt7EXE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA6_NTB2_NTC2_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 2 + NonTemporalC: 2 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1185 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA6_NTB2_NTC2_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16AHyARlXgTjdAI23gNW4dApx6DMJaSfJpM-0cMOXqgLM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA5_NTB2_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 2 + NonTemporalC: 3 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1186 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA5_NTB2_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 7 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16tx-4ufoPB35PKPzGeDu5RrFF5vOp90Mym77Kr7yJUzU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA4_NTB3_NTC2_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 3 + NonTemporalC: 2 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1187 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA4_NTB3_NTC2_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 7 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI3TDJTmiFLUEgX0tXG202WiRF6oFxe_0TT4eom7r9_0Ac= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA5_NTB2_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1188 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA5_NTB2_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI16R-ATXz9Yd6RvRd5kjy1WGtuKGx-vB0uyhTw_Cz4fROc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_6_MO40_NTn1_NTA6_NTB3_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 26112 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1189 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_6_MO40_NTn1_NTA6_NTB3_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT96x128x128_MI16OrDSdF2N5bMx7BZEKVkeNcXMFOQVnuE1GYd4kFi4p4k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_2_MO40_NTn1_NTA0_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1190 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT6_2_MO40_NTn1_NTA0_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16vvzUUIRVm-_WNDYPm_mY7aT3cTQOuxE8xSK3D49GB7M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA3_NTB2_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1191 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA3_NTB2_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16ZMmuRq8okPyiGVg9Uc5STuyJssxmYDfpDGCFMgo_u6I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA0_NTB1_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 3] + MIWaveTileA: 16 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1192 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA0_NTB1_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM2_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x192x64_MI16dgJxlSqa3PH3ZHKHY2WFZY8G9Uvukp0pn9K9jmhDu_4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_12_MO40_NTn1_NTA2_NTB3_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 12] + MIWaveTileA: 4 + MIWaveTileB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1193 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_12_MO40_NTn1_NTA2_NTB3_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 12 + ThreadTileA: 16 + ThreadTileB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT224x192x64_MI160KvHYc14waQOBkL2zi9jZwF3icdFceVrVPedMshh6n4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x192x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_6_MO40_NTn1_NTA3_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63488 + LdsInitCVgprs: false + LdsNumBytes: 63488 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 63488 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 6] + MIWaveTileA: 7 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 192 + MacroTileA: 224 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 168 + NumGlobalWriteVectorsPerThread: 168 + NumLoadsA: 7 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1194 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x192x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_6_MO40_NTn1_NTA3_NTB2_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 6 + ThreadTileA: 28 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT320x192x64_MI165sx0jrMP2bDu9fyIgBFfOxD4DnHxq_GElkCuRlKNwGM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA2_NTB0_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 73728 + LdsInitCVgprs: false + LdsNumBytes: 73728 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 177152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 73728 + LdsOffsetMetadata_Blk: 177152 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 6] + MIWaveTileA: 10 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 192 + MacroTileA: 320 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 120 + NumLoadsA: 10 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1195 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA2_NTB0_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 6 + ThreadTileA: 40 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16kgKz9eDdoAIPzBKjLODCrBuTAfnKwHJZANsd6NDwx_A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC7_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 7 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1196 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC7_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT320x192x64_MI16X48J9Ltjr6Fni-YKxBouUUC2mYFqLM8XpLOPd-sDHe8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA3_NTB0_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147456 + LdsInitCVgprs: false + LdsNumBytes: 147456 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73728 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 119808 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 46080 + LdsOffsetMetadata_Blk: 119808 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 6] + MIWaveTileA: 10 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 192 + MacroTileA: 320 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 120 + NumLoadsA: 40 + NumLoadsB: 24 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 40 + NumLoadsPerpendicularB: 24 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1197 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA3_NTB0_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 6 + ThreadTileA: 40 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT320x192x64_MI16SYrS5uSE4BZwd5zCtirVCnEaiomc9Gwfpg1irfDnwjI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA2_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147456 + LdsInitCVgprs: false + LdsNumBytes: 147456 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73728 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 119808 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 46080 + LdsOffsetMetadata_Blk: 119808 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 6] + MIWaveTileA: 10 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 192 + MacroTileA: 320 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 120 + NumLoadsA: 40 + NumLoadsB: 24 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 40 + NumLoadsPerpendicularB: 24 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1198 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA2_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 6 + ThreadTileA: 40 + ThreadTileB: 6 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI168TPQoax6F4UkmsaaU3KdrUIni5DMVXBrW9IC0RhNmv0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1199 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB3_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16CTefB-e7cCg4ntBxNlMXk5tK97DQ4ISlEZQ0v1ZQdP8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA6_NTB0_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 107776 + LdsInitCVgprs: false + LdsNumBytes: 107776 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 6 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1200 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA6_NTB0_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM32_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16TSGusewBQQP1TvscvqWbAy6gVUSgIEKyX_h0ovl0PD4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1201 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB3_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 6 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT320x192x64_MI162KXdL087ltM0nHmSSdKqcCJ_I5ZgMGqklXJFgkMJV58= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA2_NTB3_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147456 + LdsInitCVgprs: false + LdsNumBytes: 147456 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73728 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 119808 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 46080 + LdsOffsetMetadata_Blk: 119808 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 6] + MIWaveTileA: 10 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 192 + MacroTileA: 320 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 120 + NumLoadsA: 40 + NumLoadsB: 24 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 40 + NumLoadsPerpendicularB: 24 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1202 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA2_NTB3_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 6 + ThreadTileA: 40 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x128_MI16xaZOyY1JvgeZrlYRLln7CdOx3Ey8o_7PdzCBRYeyIqI0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB5_NTC7_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 35328 + LdsInitCVgprs: false + LdsNumBytes: 35328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 5 + NonTemporalC: 7 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1203 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB5_NTC7_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x80x128_MI16nWElDIMquA3ot6AV3fbrgjigAA3FZGb6UR_F8ui9JDg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB4_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57856 + LdsInitCVgprs: false + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 57856 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 4 + NonTemporalC: 1 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1204 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB4_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16xRp_h3XNfKJgKuXa390hR_a_PA_ozB8s58WbDDA8vco= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1205 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT384x160x64_MI16EU1PEmRnXOvL0QHfLfUoevgFsfi_zte_dZX-qyVqCTs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 146688 + LdsInitCVgprs: false + LdsNumBytes: 146688 + LdsNumElementsAlignedA: 52224 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73344 + LdsOffsetB: 52224 + LdsOffsetB_Blk: 125568 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 52224 + LdsOffsetMetadata_Blk: 125568 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 5] + MIWaveTileA: 12 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 160 + MacroTileA: 384 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 48 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 48 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1206 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 5 + ThreadTileA: 48 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16AMWL3EVBr59_lpeIRjHZ5OTH_ogtoF3ZIEa5R4OAZ1A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB1_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1207 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB1_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16wXkV5TwwY5Cn1b-JpZBDfovmW55D-0rmv6LNF7tIHLk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1208 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16ykvPBnHYAGU2gzFcLmZFD7tn0-B_zHzocTEkJxoRSzM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 7 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1209 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16I-I_p9PW9YTq4muXWI4l4cM_6YWQsOFlAeiPYcYnBbg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA1_NTB7_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 7 + NonTemporalC: 0 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1210 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA1_NTB7_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x512x64_MI32hb5tzjXwCzldRFZdXoQyPxBPkv6tY4c23Yw4cMCM6uc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x512x64_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB0_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 84480 + LdsInitCVgprs: false + LdsNumBytes: 84480 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 147968 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 84480 + LdsOffsetMetadata_Blk: 147968 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 512 + MacroTileA: 128 + MacroTileB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1211 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x512x64_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB0_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x96x64_MI16x8ifhaVedsGvXtVvssmJeGqcuP6lzVwXQQU3EBLvx1ik= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x96x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA2_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 96 + MacroTileA: 192 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1212 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x96x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA2_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 3 + ThreadTileA: 24 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x512x64_MI32fDHzmK6xXv9KWD_zOxf86BgGGKPdBqZvbIC9H9_dJPo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x512x64_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 84480 + LdsInitCVgprs: false + LdsNumBytes: 84480 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 147968 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 84480 + LdsOffsetMetadata_Blk: 147968 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 512 + MacroTileA: 128 + MacroTileB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 4 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1213 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x512x64_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB2_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x64_MI16xofYKRGZZFMipcebwqyHJvXrkAsHMwyx3fDtvPyIP_3o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD6_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59392 + LdsInitCVgprs: false + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1214 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD6_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI1rzg0c0k-G552AVJtHvjXC_hzf6O8g9UuJzfRdWbtvJk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA2_NTB5_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 134144 + LdsInitCVgprs: false + LdsNumBytes: 134144 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 5 + NonTemporalC: 6 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1215 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA2_NTB5_NTC6_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI11rtTFRPNt4P5Va9APcSMLjHPb84VZ_C5_J1XxdlxNLg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB6_NTC0_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 0 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1216 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB6_NTC0_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x112x64_MI16oOsVY6D-NipSJxy10I5Rw5jRX6kT2cwwtrRzLOz54fk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x112x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB3_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 52736 + LdsInitCVgprs: false + LdsNumBytes: 52736 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 17920 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 52736 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 112 + MacroTileA: 256 + MacroTileB: 112 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1217 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x112x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB3_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16x5nGlymsVtuy0UhY3byt5q7qpRM_MuGfMJE_XfThop3A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 + LdsInitCVgprs: false + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1218 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM0_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xLRdvQHXKSCos5Tu-D11kfmXmkI9jSpB1lGsPvG2GoAI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC1_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49664 + LdsInitCVgprs: false + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1219 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB2_NTC1_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x48x256_MI16xWMH4TKhQRr2HJbKo9Afjor4Fu0shSgIudw1EsHpYQt8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA1_NTB1_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 26112 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 48 + MacroTileA: 64 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1220 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA1_NTB1_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x48x256_MI16xIiqpyrq_uqEeJj6BiF3ZmZrgcsqh5b0y013DTgUVW9c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA2_NTB0_NTC1_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 26112 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 48 + MacroTileA: 64 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1221 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA2_NTB0_NTC1_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x96x128_MI16xP1L5z9WZqjHH7WZDMoE5_HmjExSEsFkiXWZJNTy9ac= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA1_NTB5_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 162816 + LdsInitCVgprs: false + LdsNumBytes: 162816 + LdsNumElementsAlignedA: 55296 + LdsNumElementsAlignedB: 26112 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 81408 + LdsOffsetB: 55296 + LdsOffsetB_Blk: 136704 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 136704 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [3, 6] + MIWaveTileA: 3 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 96 + MacroTileA: 192 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 5 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 12 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1222 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA1_NTB5_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 6 + ThreadTileA: 12 + ThreadTileB: 6 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xyV07fwu9mTddfa7GWyFIPasGkXOBxRkCVDopz1XMCLY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49664 + LdsInitCVgprs: false + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1223 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x16x256_MI16xqWBuRuZWKZqWHHvNoJYjVGO_ekpT4MJGqXZdUhRH7rk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB0_NTC2_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49664 + LdsInitCVgprs: false + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 0 + NonTemporalC: 2 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1224 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB0_NTC2_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM0_WGMXCC16_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16CH1VcmCQmierqW5AI3bjXgohKh5G7hJmekg943ulY18= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA3_NTB2_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 4] + MIWaveTileA: 16 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1225 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA3_NTB2_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x96x128_MI16x5j7S8DsiiOwJFsOXYUlJ1sagQRIBIIxRZnRkoen9XFY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA2_NTB5_NTC1_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 102400 + LdsInitCVgprs: false + LdsNumBytes: 102400 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 5 + NonTemporalC: 1 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1226 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA2_NTB5_NTC1_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16Vji1j4OxfsLqG_xzGoVP-y3iYAloZaNl0PB8lNHJwcc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB0_NTC7_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 7 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1227 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB0_NTC7_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x64_MI16x1nMK6AfLdIroY7RvkxdBHqku3JZRVsx_AwMoWOzZPuEE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA1_NTB1_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54656 + LdsInitCVgprs: false + LdsNumBytes: 54656 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 12672 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1228 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA1_NTB1_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x128_MI16xmnyarJco_A8IfBxdNGDCY0IdmOXcznvmDyuAKXCarnw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB1_NTC0_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59904 + LdsInitCVgprs: false + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1229 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB1_NTC0_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x160x64_MI16xJ4zCv-v4yPVlnVRuvGsRDc3yy_OXzgPhOxGqze9I82o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA2_NTB3_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100352 + LdsInitCVgprs: false + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 160 + MacroTileA: 64 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1230 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA2_NTB3_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x320x64_MI163BB2OdtcDRq0YTxjzncmz9jNVN2ieeiaJV92h-h8bkk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA2_NTB3_NTC7_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 77312 + LdsInitCVgprs: false + LdsNumBytes: 77312 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 51200 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 157184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 77312 + LdsOffsetMetadata_Blk: 157184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 5] + MIWaveTileA: 12 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 320 + MacroTileA: 192 + MacroTileB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 6 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1231 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA2_NTB3_NTC7_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 5 + ThreadTileA: 48 + ThreadTileB: 5 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI32pa90HkBFZszgmyk8cSWj7VJh6x6NoAmXyTa1-wCPPAw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB2_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 26112 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1232 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI32x32x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB2_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM48_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI16VAoYLWEAzwFDxDd54BeMVgztslaKoupk2qXOkZfcy6M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1233 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x128_MI16m67iHGRkyjsua8Y3ZAKWIQ3lrNKmV0GybKAIHGx651s= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_6_MO40_NTn1_NTA1_NTB3_NTC0_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 26112 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1234 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_6_MO40_NTn1_NTA1_NTB3_NTC0_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16Sc9ZnmNmQyJZzXk5_bTgKnXoMLctw_GCr4c6imMZKAU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1235 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16XNYwhkB1i3h3py0aWfqcrtZp3o_uju369IqAuDh0D9Y= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA0_NTB1_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [16, 4] + MIWaveTileA: 16 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1236 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT16_4_MO40_NTn1_NTA0_NTB1_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x96x64_MI16x4DHRGJR75pOZw4c5p5rnEZpn666-eMOFPEiWzJf16cs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB1_NTC4_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 62336 + LdsInitCVgprs: false + LdsNumBytes: 62336 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 12672 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 4 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1237 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA3_NTB1_NTC4_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM7_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 7 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16i8t-GgU5Q80x4TX7poc8O8Z6fliYd9KW4Fk6McpaUsg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA2_NTB5_NTC2_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 5 + NonTemporalC: 2 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1238 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA2_NTB5_NTC2_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x160x64_MI16HeJ3HkPhXArbTaCzCEhModAMu1qfY2zEsIiaUPHpOjo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA1_NTB1_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 103552 + LdsInitCVgprs: false + LdsNumBytes: 103552 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 4 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1239 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA1_NTB1_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x64_MI16zxF5V4DipeTyDKU2HauhCwa4opcsH-3KSbbqpAojhUY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB2_NTC5_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 107776 + LdsInitCVgprs: false + LdsNumBytes: 107776 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 16 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1240 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB2_NTC5_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16T8bIoEQ8w83-veNnIiq2cRL-zF5zcd9j_KxG5-gph6w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1241 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16y41iHlpqnwrl4puQwXWETtJTEdTi-uR6doRiTEcPSOI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 7 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1242 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI163S619iUTTs1DRCJUrp43Nutt3LrBnhsWmzF-r3we8_g= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1243 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x224x64_MI16SikajvzVA5OXLAR1plgs1OpeZpGvWkBbL6pEQQzseWY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA3_NTB0_NTC6_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1244 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA3_NTB0_NTC6_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16bR3OkMogEfrqwiHIPFS1G8qSzbA3OzRizISozfX8yn8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 4 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1245 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB3_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x160x64_MI16Ikz9NrEog9Iab0ACUAMKFprssfdTaWu0Sn-WfcoA4yQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA3_NTB2_NTC1_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120448 + LdsInitCVgprs: false + LdsNumBytes: 120448 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 6 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1246 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA3_NTB2_NTC1_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x16x128_MI16xq2mNsqje0dAsoGXSlhSzpzKYmt1D4fA2PGazwbV3lwY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA2_NTB0_NTC3_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 13312 + LdsInitCVgprs: false + LdsNumBytes: 13312 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 25088 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 13312 + LdsOffsetMetadata_Blk: 25088 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 3 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1247 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA2_NTB0_NTC3_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x512x32_MI16U44d2XEmaaLr7fxfEIJvdiGdIfQ7-6dJT0GInqIzp2E= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x512x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 107776 + LdsInitCVgprs: false + LdsNumBytes: 107776 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 512 + MacroTileA: 128 + MacroTileB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1248 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x512x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB1_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x16x256_MI16xxe0lbF1wYDgdwMiICCtw-8ZmcwHCc0IyKMRBC_Z4OuA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58112 + LdsInitCVgprs: false + LdsNumBytes: 58112 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1249 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x224x64_MI16xdvP8knMbPwcl7ZvrEk-6U7Slr8ePNn9-ax9TSWYIvFQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 70656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 70656 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 7] + MIWaveTileA: 1 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 224 + MacroTileA: 32 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 1 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 14 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1250 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC4_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 7 + ThreadTileA: 4 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x320x64_MI16xQcS-IB5os2-njnft6oKfGtmZilIzSH7Yf8qnoMMZoUw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x320x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55808 + LdsInitCVgprs: false + LdsNumBytes: 55808 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 51200 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 70144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 55808 + LdsOffsetMetadata_Blk: 70144 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 320 + MacroTileA: 32 + MacroTileB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 1 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1251 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x320x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC16_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x384x64_MI32xVmLymHmALnw8uJzr2rIVhG01yzw4b-CihZZAolB_J94= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x384x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 6] + MIWaveTileA: 1 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 384 + MacroTileA: 64 + MacroTileB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 2 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1252 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x384x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x128_MI16xljLkR-MC2Kp7r4mQWnSgh-_rAHdYEmvYJ0vhf_HZrbE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59392 + LdsInitCVgprs: false + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1253 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x16x128_MI16xY_fB8yBjfBVTqf21JvYZMtCM0wbalgcOF0cbbjUrcj8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55808 + LdsInitCVgprs: false + LdsNumBytes: 55808 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 51200 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1254 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x256_MI16xK_IdtcbXjwpCsQHhFBzb3dEZoarGsVFXe0p_YKxZSjo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1255 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x256x32_MI32hF-nawwsg1CZEExwm7AU15iYRlpBXTHaGjx3S2MwWFw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 8] + MIWaveTileA: 1 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1256 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM48_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x64_MI16x1RJsBXy1EedBpQvnlTww6ISCXq0R26MRxhWKF54ouZ_Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29440 + LdsInitCVgprs: false + LdsNumBytes: 29440 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 24832 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 24832 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1257 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT16x64x128_MI16xYDO1iJj3E-JDD2XpyQJtuSnyKAvVRi3BkhUYaZqQeuY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55808 + LdsInitCVgprs: false + LdsNumBytes: 55808 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 37376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 37376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1258 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x128_MI16xKcoHi0rxFknrT_7yBPgOnz2MfWxKC3xFrC0Ni7xVUzI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59904 + LdsInitCVgprs: false + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1259 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x32x128_MI16xlFDa2K3MuMsbiAffy908Ci3OR10WpBziR3hLTuSX7z0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1260 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x128_MI16xIKXHmd8gZNhv-zbcLN0faFPYz43Jrl2CV2wrzpFB0a8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59904 + LdsInitCVgprs: false + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1261 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x128_MI16xjTnWDBj9gfbgCPi_eVN5gKALAsymxsBI1IBw6MBcv4Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59904 + LdsInitCVgprs: false + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1262 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x48x128_MI16xh6jJh0ws8zzps7fYbjhs6hy_-3ZUYAAUjeK9k60roG8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32256 + LdsInitCVgprs: false + LdsNumBytes: 32256 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 13824 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 51200 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 48 + MacroTileA: 64 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1263 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI32Rffl6ymUFAp98jlfV4YXC6pdaxdjkw2_VHdOmM4jHqo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 136192 + LdsInitCVgprs: false + LdsNumBytes: 136192 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 68096 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1264 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 128 + ThreadTile1: 2 + ThreadTileA: 128 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x320x64_MI32bsLqIqAcetg6HG7D18y2X07rgxcu8_ErA6h27oGBrIs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x320x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 159744 + LdsInitCVgprs: false + LdsNumBytes: 159744 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 79872 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 113664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 113664 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 320 + MacroTileA: 256 + MacroTileB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1265 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x320x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 5 + ThreadTileA: 64 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT304x256x32_MI16Tjjrwdw9U46olzLPfOBMR5EhgMS12b4ye0cB8SpssnA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT304x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT19_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 108288 + LdsInitCVgprs: false + LdsNumBytes: 108288 + LdsNumElementsAlignedA: 24320 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24320 + LdsOffsetB_Blk: 89856 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24320 + LdsOffsetMetadata_Blk: 89856 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [19, 4] + MIWaveTileA: 19 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 304 + MacroTile1: 256 + MacroTileA: 304 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 304 + NumGlobalWriteVectorsPerThread: 304 + NumLoadsA: 19 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 19 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1266 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT304x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT19_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 6 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 76 + ThreadTile1: 4 + ThreadTileA: 76 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT160x224x64_MI1648bVaAUzsdymA40ZBReGgUBY9wSXQKOIE6GYdWx-uok= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x224x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_7_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 21120 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 21120 + LdsOffsetB_Blk: 86656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 21120 + LdsOffsetMetadata_Blk: 86656 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 7] + MIWaveTileA: 5 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 224 + MacroTileA: 160 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 140 + NumGlobalWriteVectorsPerThread: 140 + NumLoadsA: 5 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1267 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x224x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_7_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 7 + ThreadTileA: 20 + ThreadTileB: 7 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x64x128_MI16xa6ICeA7RTUgxrRVggJel5U7xK5T7ToXE1EmSGkbS5_k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1268 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT352x192x64_MI16l1vEnPkVuHET7NK2ATDb0pU-YiGihu1HUgRCpu7gOA4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT352x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT11_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 83968 + LdsInitCVgprs: false + LdsNumBytes: 83968 + LdsNumElementsAlignedA: 56320 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 56320 + LdsOffsetB_Blk: 187392 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 83968 + LdsOffsetMetadata_Blk: 187392 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [11, 6] + MIWaveTileA: 11 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 352 + MacroTile1: 192 + MacroTileA: 352 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 264 + NumGlobalWriteVectorsPerThread: 264 + NumLoadsA: 11 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 11 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1269 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT352x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT11_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 6 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 44 + ThreadTile1: 6 + ThreadTileA: 44 + ThreadTileB: 6 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT384x192x64_MI32Zpo8goNvpSlOIM0c0x1NSjHMZwETEAcX_HCkN0AJHWE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x192x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 159744 + LdsInitCVgprs: false + LdsNumBytes: 159744 + LdsNumElementsAlignedA: 52224 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 79872 + LdsOffsetB: 52224 + LdsOffsetB_Blk: 132096 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 52224 + LdsOffsetMetadata_Blk: 132096 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 192 + MacroTileA: 384 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 12 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1270 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x192x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 16 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT224x320x64_MI16IbznA20tkEc7Vj4IJ67XlE06oSnbtJs8HSzot_SXq6s= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x320x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 81920 + LdsInitCVgprs: false + LdsNumBytes: 81920 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 166912 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 81920 + LdsOffsetMetadata_Blk: 166912 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 10] + MIWaveTileA: 7 + MIWaveTileB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 320 + MacroTileA: 224 + MacroTileB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 280 + NumGlobalWriteVectorsPerThread: 280 + NumLoadsA: 7 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1271 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x320x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 10 + ThreadTileA: 28 + ThreadTileB: 10 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x32x256_MI16xvu2kQO08CL2bF1aHWqWlZ9pMX10-g7H0PnZ6cGUDiEA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117248 + LdsInitCVgprs: false + LdsNumBytes: 117248 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1272 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT64x96x64_MI16x1F1fhxsSSmWc3n3OLho1FVytNcfOtjYTn4OR17BAGQ_g= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24576 + LdsInitCVgprs: false + LdsNumBytes: 24576 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1273 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT224x352x64_MI16CsJoK9idtPSUD8Ty7Ztn7FpFCVuWcTWXdguIEd4CJv0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x352x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 157440 + LdsInitCVgprs: false + LdsNumBytes: 157440 + LdsNumElementsAlignedA: 32256 + LdsNumElementsAlignedB: 46464 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78720 + LdsOffsetB: 32256 + LdsOffsetB_Blk: 110976 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 110976 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 11] + MIWaveTileA: 7 + MIWaveTileB: 11 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 352 + MacroTileA: 224 + MacroTileB: 352 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 308 + NumGlobalWriteVectorsPerThread: 308 + NumLoadsA: 28 + NumLoadsB: 11 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 28 + NumLoadsPerpendicularB: 11 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1274 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x352x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 11 + ThreadTileA: 28 + ThreadTileB: 11 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT320x224x64_MI16N1M5rkedW9IIBVtKqKgnIaGtLErocWo9rCj4fQQxbBI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x224x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 151296 + LdsInitCVgprs: false + LdsNumBytes: 151296 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 29568 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 75648 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 121728 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 46080 + LdsOffsetMetadata_Blk: 121728 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 7] + MIWaveTileA: 10 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 224 + MacroTileA: 320 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 280 + NumGlobalWriteVectorsPerThread: 140 + NumLoadsA: 40 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 40 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1275 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x224x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 7 + ThreadTileA: 40 + ThreadTileB: 7 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI16RHMEUqyJMPgtxlEMcaBCkYo-nIQbn_jLPePb0FrByaY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1276 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT320x224x64_MI16QWWiu2I6N3q4RKm0bYWYGdmLju8YmPdMM2od6IXGxUA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 81920 + LdsInitCVgprs: false + LdsNumBytes: 81920 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 177152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 81920 + LdsOffsetMetadata_Blk: 177152 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [10, 7] + MIWaveTileA: 10 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 320 + MacroTile1: 224 + MacroTileA: 320 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 280 + NumGlobalWriteVectorsPerThread: 140 + NumLoadsA: 10 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1277 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 7 + ThreadTileA: 40 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16x0QZ3Xd_U4bAcjnf2AhgHkVcKZt4sFFcqMwG-cj7oUs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1278 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x128_MI16xUdp65sJqNoqnh6_4ZUgmPACJNOhuYax1SY1F1-WMB5U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58112 + LdsInitCVgprs: false + LdsNumBytes: 58112 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumTotalPackedLoadsA: 2 + NumTotalPackedLoadsB: 4 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1279 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x128_MI16zpFtN9LdkBZoDzbM36b0TsUCZeT7PdmyQPjRpwZgPFA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116224 + LdsInitCVgprs: false + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumTotalPackedLoadsA: 8 + NumTotalPackedLoadsB: 4 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1280 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x32_MI16510iu-d4Np1RpdbP6m1mk7l4bnEoYz1VU7F44UA8jng= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54656 + LdsInitCVgprs: false + LdsNumBytes: 54656 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 12672 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumTotalPackedLoadsA: 8 + NumTotalPackedLoadsB: 3 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1281 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16F3kReQ2eH9TE-qSkS5O0f5m_mKkDGWkDtpg_-LiGDPo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 35328 + LdsInitCVgprs: false + LdsNumBytes: 35328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1282 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x352x64_MI162pLXduUvBQP0-lV9atvoPb18J3juYG1iEYmbEfJdyzs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x352x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_11_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 83968 + LdsInitCVgprs: false + LdsNumBytes: 83968 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 56320 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 158720 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 83968 + LdsOffsetMetadata_Blk: 158720 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 11] + MIWaveTileA: 6 + MIWaveTileB: 11 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 352 + MacroTileA: 192 + MacroTileB: 352 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 264 + NumGlobalWriteVectorsPerThread: 132 + NumLoadsA: 6 + NumLoadsB: 11 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 11 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1283 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x352x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_11_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 11 + ThreadTileA: 24 + ThreadTileB: 11 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x32_MI16Ars6JuwHiKDSsjI9NlZXKxYKv7f9EZfkB00w5X7ciWk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 30720 + LdsInitCVgprs: false + LdsNumBytes: 30720 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 30720 + LdsOffsetMetadata_Blk: 48128 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1284 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x288x64_MI16gYi6KcVjhFqquzIAsZ7MfVfRHg_pbA_zwydTMzBA1T8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x288x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 73728 + LdsInitCVgprs: false + LdsNumBytes: 73728 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 158720 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 73728 + LdsOffsetMetadata_Blk: 158720 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 9] + MIWaveTileA: 6 + MIWaveTileB: 9 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 288 + MacroTileA: 192 + MacroTileB: 288 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 216 + NumGlobalWriteVectorsPerThread: 108 + NumLoadsA: 6 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 9 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1285 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x288x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 9 + ThreadTileA: 24 + ThreadTileB: 9 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x32_MI16qGg4gMNzi8gLYbcwfQtpkFvZdpiVCgTgsuPxf3sOe7E= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59264 + LdsInitCVgprs: false + LdsNumBytes: 59264 + LdsNumElementsAlignedA: 13824 + LdsNumElementsAlignedB: 12672 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 13824 + LdsOffsetB_Blk: 46592 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 13824 + LdsOffsetMetadata_Blk: 46592 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 3] + MIWaveTileA: 12 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 12 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumTotalPackedLoadsA: 12 + NumTotalPackedLoadsB: 3 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1286 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 3 + ThreadTileA: 48 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x32_MI16Taiofo8LsVqbUFqVYq-ILVex9ddEbciO8EKO2rFKNt0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61952 + LdsInitCVgprs: false + LdsNumBytes: 61952 + LdsNumElementsAlignedA: 13824 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 13824 + LdsOffsetB_Blk: 46592 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 13824 + LdsOffsetMetadata_Blk: 46592 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 3] + MIWaveTileA: 12 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1287 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 3 + ThreadTileA: 48 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16Z_kSdkcI4g5s7lc4MHuNzxTskYaE1_oj4Ot5voiNmXg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126464 + LdsInitCVgprs: false + LdsNumBytes: 126464 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 4] + MIWaveTileA: 12 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1288 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x320x64_MI16ZdvaJkAZRurjQXNvGSJkYtmmK3QsUQdjL6rt-AFyQLc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 25344 + LdsNumElementsAlignedB: 42240 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 25344 + LdsOffsetB_Blk: 92928 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25344 + LdsOffsetMetadata_Blk: 92928 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 5] + MIWaveTileA: 12 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 320 + MacroTileA: 192 + MacroTileB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 6 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumTotalPackedLoadsA: 6 + NumTotalPackedLoadsB: 10 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1289 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 5 + ThreadTileA: 48 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16auSttg0EKVCahHTsCVhh2cHh6K9v0owM5LZjw9upYO0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 25344 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25344 + LdsOffsetB_Blk: 90880 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25344 + LdsOffsetMetadata_Blk: 90880 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 4] + MIWaveTileA: 12 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumTotalPackedLoadsA: 6 + NumTotalPackedLoadsB: 8 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1290 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI1-1hEVZXQwB7Hws6RaqtCFGEV7Acaqu3sY_RqiOOC7RE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 136192 + LdsInitCVgprs: false + LdsNumBytes: 136192 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 68096 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1291 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM24_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI16FNZI1UQPjZLy0q7-OOgjKhZRyX00yKIq5CfnQfl_V3c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1292 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI16Gjd69lk-6PFMJ4hivbEFNUHWuAsTA0DZOYw6wfKBqEk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59392 + LdsInitCVgprs: false + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1293 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 64 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 2 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI162KTaXr7l0KxzQ5okfYZt6ELI8DjoxXRRlCeFBY7TRq0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58112 + LdsInitCVgprs: false + LdsNumBytes: 58112 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: 4 + NumTotalPackedLoadsB: 2 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1294 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO4_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 2 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16kY8ELbPpI_HvuAM-0iLbQ3XhdpmE6DmGEqo9Lq22fls= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumTotalPackedLoadsA: 8 + NumTotalPackedLoadsB: 8 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1295 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x224x64_MI16VB2tykcLAxxPQCIrT_2sMKERzY3mGwD7KIGoTP2KAu0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53248 + LdsInitCVgprs: false + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 224 + MacroTileA: 128 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 4 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1296 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI165u1frtcv6LOOFyTOroan3pGoqINhRgHC2_Td60yfhm0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99328 + LdsInitCVgprs: false + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumTotalPackedLoadsA: 4 + NumTotalPackedLoadsB: 4 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1297 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI1GS5kxWaKBHhotJrg4AiEHwBFGMMKDZHMHRwyi5luodo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 136192 + LdsInitCVgprs: false + LdsNumBytes: 136192 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 68096 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 102912 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 102912 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1298 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI3NvZvxpGo9E4DLXcNaAG_Q1fEQ_9-rCX24hEk7ZcKb8U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1299 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM24_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT224x256x64_MI16AooNQ0DxCE1iMNXzqX0FHV8K0vAlx-m-ncqv9HuoLPs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 139264 + LdsInitCVgprs: false + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 105472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 35840 + LdsOffsetMetadata_Blk: 105472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 8] + MIWaveTileA: 7 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 256 + MacroTileA: 224 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLoadsA: 7 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1300 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 8 + ThreadTileA: 28 + ThreadTileB: 8 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI3rh3QQEBSU5ZwlIzZgLp80EZwlPEoe5VChZIZ7RQ89CY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 136192 + LdsInitCVgprs: false + LdsNumBytes: 136192 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 68096 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1301 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 1 + ThreadTileA: 64 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x256x64_MI32Jze6abHJ8l5M0yYk0uV7vwVwvrcqi7qDRWfQz8wCrH8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116736 + LdsInitCVgprs: false + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1302 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT128x256x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI3207RbkMHStFD_YoILeumHizH3M8jeERDgPC5Q3QefifM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116736 + LdsInitCVgprs: false + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 4 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1303 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI32XwDoh2Lmq9ee-Smh9-VB0ZyuLvkIBEspEN9Q2R_QhGo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116736 + LdsInitCVgprs: false + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI165JlqJQiebX2lT4lUU-UJbJ4riuk32JddM1ZpBJ38BuA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumTotalPackedLoadsA: 8 + NumTotalPackedLoadsB: 8 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1305 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW8_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16ioP1gzZP2YiHyhxAp-lrgdDadhVY-tEt9vfS_ve4KV4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 137216 + LdsInitCVgprs: false + LdsNumBytes: 137216 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 68608 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 103424 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 103424 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 16] + MIWaveTileA: 4 + MIWaveTileB: 16 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1306 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_16_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT512x128x32_MI32CpkerEf2mAVVrDatpBRwuRndjKbhR0RH7EPU5eAZMNc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT512x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 128 + MacroTileA: 512 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1307 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT512x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS64_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 64 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT23wJ8qNpK8Q_UogSvwK03hvcukDrQtjltSVIFXFfA4es= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x240x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM1024_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_15_MO40_NTn1_NTA0_NTB0_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 1024 + LdsBytesNoAmax: 136704 + LdsInitCVgprs: false + LdsNumBytes: 136704 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 34560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 68352 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 102144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 102144 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: -1 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 15] + MIWaveTileA: 4 + MIWaveTileB: 15 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 240 + MacroTileA: 256 + MacroTileB: 240 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 8 + NumLoadsB: 30 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 30 + NumThreads: 256 + NumTotalPackedLoadsA: 8 + NumTotalPackedLoadsB: 30 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1308 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x240x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM1024_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_15_MO40_NTn1_NTA0_NTB0_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCCn1_WGMXCCGn1 + SourceSwap: true + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: true + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 15 + ThreadTileA: 16 + ThreadTileB: 15 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: true + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: -1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT2GQQfPVRirXfdyXyJUWH87knWjo4-PQrW8_xmR_z5WEU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x208x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM1024_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 1024 + LdsBytesNoAmax: 129280 + LdsInitCVgprs: false + LdsNumBytes: 129280 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 29952 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: -1 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 13] + MIWaveTileA: 4 + MIWaveTileB: 13 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 208 + MacroTileA: 256 + MacroTileB: 208 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 208 + NumGlobalWriteVectorsPerThread: 52 + NumLoadsA: 8 + NumLoadsB: 26 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 26 + NumThreads: 256 + NumTotalPackedLoadsA: 8 + NumTotalPackedLoadsB: 26 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1309 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT256x208x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM1024_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: true + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 13 + ThreadTileA: 16 + ThreadTileB: 13 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: true + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT2GGMqhPpxk2J1Q1819y_V3YsuUalVXKAw0Crcj6ZVwr0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128896 + LdsInitCVgprs: false + LdsNumBytes: 128896 + LdsNumElementsAlignedA: 29568 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 29568 + LdsOffsetB_Blk: 95104 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29568 + LdsOffsetMetadata_Blk: 95104 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 8] + MIWaveTileA: 7 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 256 + MacroTileA: 224 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLoadsA: 7 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumTotalPackedLoadsA: 7 + NumTotalPackedLoadsB: 8 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1310 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 8 + ThreadTileA: 28 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1iMO6yjTG4yuZPDkcMXrcObwuUOXV_ipLalV5INSQqYE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123776 + LdsInitCVgprs: false + LdsNumBytes: 123776 + LdsNumElementsAlignedA: 24960 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24960 + LdsOffsetB_Blk: 90496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24960 + LdsOffsetMetadata_Blk: 90496 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumTotalPackedLoadsA: 6 + NumTotalPackedLoadsB: 8 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1311 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [16, 368640, 1, 224] + - [0, 0.0] + - - [121, 491520, 1, 105] + - [1256, 0.0] + - - [256, 1280, 1, 8192] + - [950, 0.0] + - - [128, 3072, 1, 440] + - [1258, 0.0] + - - [160, 3072, 1, 316] + - [1, 0.0] + - - [384, 245760, 1, 648] + - [2, 0.0] + - - [208, 491520, 1, 1224] + - [3, 0.0] + - - [256, 3072, 1, 316] + - [1261, 0.0] + - - [256, 3072, 1, 440] + - [1262, 0.0] + - - [256, 3072, 1, 888] + - [1263, 0.0] + - - [2048, 1280, 1, 20512] + - [946, 0.0] + - - [1032, 245760, 1, 4096] + - [4, 0.0] + - - [304, 368640, 1, 624] + - [5, 0.0] + - - [512, 1792, 1, 2048] + - [6, 0.0] + - - [320, 3072, 1, 316] + - [7, 0.0] + - - [320, 3072, 1, 440] + - [8, 0.0] + - - [320, 3072, 1, 888] + - [1268, 0.0] + - - [352, 245760, 1, 3600] + - [9, 0.0] + - - [400, 3072, 1, 512] + - [10, 0.0] + - - [440, 3072, 1, 256] + - [11, 0.0] + - - [424, 286720, 1, 1872] + - [12, 0.0] + - - [512, 3072, 1, 72] + - [13, 0.0] + - - [512, 3072, 1, 240] + - [1273, 0.0] + - - [512, 3072, 1, 2048] + - [14, 0.0] + - - [768, 3072, 1, 2048] + - [15, 0.0] + - - [768, 3072, 1, 3840] + - [16, 0.0] + - - [768, 3072, 1, 4480] + - [953, 0.0] + - - [768, 3072, 1, 5120] + - [17, 0.0] + - - [888, 3072, 1, 256] + - [18, 0.0] + - - [1024, 1280, 1, 5632] + - [19, 0.0] + - - [1024, 1792, 1, 13184] + - [949, 0.0] + - - [1024, 1792, 1, 64704] + - [952, 0.0] + - - [2304, 1280, 1, 9600] + - [20, 0.0] + - - [1024, 3072, 1, 4800] + - [942, 0.0] + - - [1280, 1568, 1, 512] + - [21, 0.0] + - - [1536, 1792, 1, 3200] + - [22, 0.0] + - - [1536, 3072, 1, 4200] + - [945, 0.0] + - - [1536, 3072, 1, 19776] + - [23, 0.0] + - - [1880, 3072, 1, 512] + - [24, 0.0] + - - [1920, 3072, 1, 2048] + - [25, 0.0] + - - [2048, 1792, 1, 8224] + - [26, 0.0] + - - [2048, 3072, 1, 1880] + - [27, 0.0] + - - [3200, 1792, 1, 64704] + - [1275, 0.0] + - - [3264, 3072, 1, 256] + - [28, 0.0] + - - [3840, 1280, 1, 5692] + - [29, 0.0] + - - [9600, 1280, 1, 5632] + - [30, 0.0] + - - [4200, 3072, 1, 256] + - [31, 0.0] + - - [98304, 1280, 1, 9600] + - [1278, 0.0] + - - [264, 160, 1792, 32] + - [32, 0.0] + - - [192, 4090, 1280, 32] + - [33, 0.0] + - - [4090, 192, 1280, 32] + - [34, 0.0] + - - [257, 160, 1792, 32] + - [35, 0.0] + - - [192, 641, 1280, 32] + - [36, 0.0] + - - [641, 192, 1280, 32] + - [37, 0.0] + - - [2126, 192, 1280, 32] + - [38, 0.0] + - - [160, 286720, 1, 257] + - [39, 0.0] + - - [20480, 1792, 1, 3200] + - [1277, 0.0] + - - [256, 1792, 1, 256] + - [40, 0.0] + - - [256, 1280, 1, 256] + - [41, 0.0] + - - [384, 245760, 1, 641] + - [1270, 0.0] + - - [5632, 1280, 1, 1024] + - [1276, 0.0] + - - [120, 3072, 1, 96] + - [42, 0.0] + - - [120, 3072, 1, 120] + - [43, 0.0] + - - [120, 3072, 1, 160] + - [1254, 0.0] + - - [160, 1219, 3072, 40] + - [44, 0.0] + - - [1219, 160, 3072, 40] + - [45, 0.0] + - - [208, 491520, 1, 1219] + - [46, 0.0] + - - [128, 491520, 1, 112] + - [47, 0.0] + - - [1920, 1280, 1, 1920] + - [48, 0.0] + - - [24, 3072, 1, 256] + - [49, 0.0] + - - [2048, 1792, 1, 59744] + - [947, 0.0] + - - [120, 1024, 1, 64] + - [50, 0.0] + - - [32, 491520, 1, 105] + - [1251, 0.0] + - - [64, 491520, 1, 1224] + - [51, 0.0] + - - [256, 1280, 1, 49152] + - [943, 0.0] + - - [120, 3072, 1, 144] + - [1253, 0.0] + - - [120, 3072, 1, 240] + - [52, 0.0] + - - [120, 3072, 1, 512] + - [1255, 0.0] + - - [125, 3072, 1, 320] + - [1257, 0.0] + - - [256, 1792, 1, 5120] + - [948, 0.0] + - - [192, 1280, 1, 2048] + - [53, 0.0] + - - [160, 3072, 1, 2048] + - [54, 0.0] + - - [160, 286720, 1, 264] + - [55, 0.0] + - - [256, 245760, 1, 648] + - [56, 0.0] + - - [256, 3072, 1, 7524] + - [941, 0.0] + - - [256, 10240, 1, 256] + - [57, 0.0] + - - [1025, 245760, 1, 2126] + - [58, 0.0] + - - [512, 1280, 1, 240] + - [59, 0.0] + - - [512, 3072, 1, 3000] + - [60, 0.0] + - - [840, 3072, 1, 256] + - [61, 0.0] + - - [1536, 3072, 1, 3264] + - [62, 0.0] + - - [1536, 3072, 1, 48760] + - [1274, 0.0] + - - [2048, 3072, 1, 2560] + - [63, 0.0] + - - [3200, 1792, 1, 1536] + - [64, 0.0] + - - [3200, 1792, 1, 13184] + - [65, 0.0] + - - [6400, 3072, 1, 256] + - [66, 0.0] + - - [7524, 3072, 1, 256] + - [67, 0.0] + - - [9600, 1280, 1, 2304] + - [68, 0.0] + - - [20512, 1280, 1, 256] + - [69, 0.0] + - - [20512, 1280, 1, 1024] + - [70, 0.0] + - - [5120, 1792, 1, 256] + - [71, 0.0] + - - [8224, 1792, 1, 256] + - [72, 0.0] + - - [8224, 1792, 1, 1024] + - [73, 0.0] + - - [13184, 1792, 1, 1024] + - [74, 0.0] + - - [48760, 3072, 1, 256] + - [75, 0.0] + - - [68032, 1280, 1, 256] + - [76, 0.0] + - - [68032, 1280, 1, 1024] + - [77, 0.0] + - - [160, 257, 1792, 32] + - [78, 0.0] + - - [192, 2126, 1280, 32] + - [79, 0.0] + - - [120, 102, 3072, 32] + - [80, 0.0] + - - [105, 160, 3072, 40] + - [81, 0.0] + - - [160, 105, 3072, 40] + - [82, 0.0] + - - [120, 3072, 1, 64] + - [83, 0.0] + - - [120, 3072, 1, 72] + - [84, 0.0] + - - [128256, 32768, 1, 4096] + - [85, 0.0] + - - [4096, 4096, 1, 4096] + - [86, 0.0] + - - [4096, 22080, 1, 4096] + - [87, 0.0] + - - [16384, 4096, 1, 4096] + - [88, 0.0] + - - [102, 120, 3072, 32] + - [89, 0.0] + - - [120, 618, 3072, 32] + - [90, 0.0] + - - [618, 120, 3072, 32] + - [91, 0.0] + - - [256, 232, 1280, 256] + - [92, 0.0] + - - [256, 532, 1280, 256] + - [93, 0.0] + - - [192, 641, 1024, 32] + - [94, 0.0] + - - [192, 2126, 1024, 32] + - [95, 0.0] + - - [641, 192, 1024, 32] + - [96, 0.0] + - - [2126, 192, 1024, 32] + - [97, 0.0] + - - [160, 1867, 1792, 32] + - [98, 0.0] + - - [1867, 160, 1792, 32] + - [99, 0.0] + - - [1, 1792, 1, 512] + - [100, 0.0] + - - [32, 1792, 1, 128] + - [101, 0.0] + - - [128, 1792, 1, 2048] + - [102, 0.0] + - - [256, 1280, 1, 5692] + - [944, 0.0] + - - [1536, 1280, 1, 5692] + - [103, 0.0] + - - [5692, 1280, 1, 256] + - [104, 0.0] + - - [15, 368640, 1, 72] + - [105, 0.0] + - - [32, 368640, 1, 102] + - [106, 0.0] + - - [54, 368640, 1, 618] + - [107, 0.0] + - - [118, 368640, 1, 102] + - [108, 0.0] + - - [128, 3072, 1, 48] + - [109, 0.0] + - - [128, 3072, 1, 60] + - [110, 0.0] + - - [128, 3072, 1, 360] + - [111, 0.0] + - - [128, 3072, 1, 1320] + - [112, 0.0] + - - [240, 3072, 1, 40] + - [113, 0.0] + - - [240, 3072, 1, 48] + - [114, 0.0] + - - [240, 3072, 1, 60] + - [115, 0.0] + - - [240, 3072, 1, 360] + - [1260, 0.0] + - - [240, 3072, 1, 1320] + - [116, 0.0] + - - [256, 3072, 1, 768] + - [117, 0.0] + - - [256, 3072, 1, 1800] + - [118, 0.0] + - - [304, 368640, 1, 618] + - [1266, 0.0] + - - [480, 3072, 1, 240] + - [119, 0.0] + - - [504, 3072, 1, 256] + - [120, 0.0] + - - [768, 3072, 1, 2560] + - [121, 0.0] + - - [1024, 3072, 1, 2048] + - [122, 0.0] + - - [2048, 3072, 1, 768] + - [123, 0.0] + - - [2048, 3072, 1, 1024] + - [124, 0.0] + - - [2560, 3072, 1, 768] + - [125, 0.0] + - - [3840, 3072, 1, 256] + - [126, 0.0] + - - [19776, 3072, 1, 256] + - [127, 0.0] + - - [32, 286720, 1, 264] + - [128, 0.0] + - - [192, 1280, 1, 512] + - [129, 0.0] + - - [512, 1280, 1, 32] + - [130, 0.0] + - - [512, 1280, 1, 60] + - [131, 0.0] + - - [512, 1280, 1, 64] + - [132, 0.0] + - - [512, 1280, 1, 72] + - [133, 0.0] + - - [512, 1280, 1, 96] + - [134, 0.0] + - - [512, 1280, 1, 120] + - [135, 0.0] + - - [512, 1280, 1, 144] + - [136, 0.0] + - - [512, 1280, 1, 160] + - [137, 0.0] + - - [512, 1280, 1, 192] + - [138, 0.0] + - - [1562, 1280, 1, 512] + - [139, 0.0] + - - [160, 1792, 1, 512] + - [140, 0.0] + - - [160, 1792, 1, 2048] + - [141, 0.0] + - - [512, 1792, 1, 60] + - [142, 0.0] + - - [512, 1792, 1, 64] + - [143, 0.0] + - - [512, 1792, 1, 72] + - [144, 0.0] + - - [512, 1792, 1, 96] + - [145, 0.0] + - - [2895, 1792, 1, 512] + - [146, 0.0] + - - [256, 450560, 1, 192] + - [147, 0.0] + - - [256, 296960, 1, 256] + - [148, 0.0] + - - [256, 680960, 1, 256] + - [149, 0.0] + - - [384, 57600, 1, 256] + - [150, 0.0] + - - [512, 40960, 1, 256] + - [151, 0.0] + - - [512, 296960, 1, 256] + - [152, 0.0] + - - [512, 680960, 1, 256] + - [153, 0.0] + - - [65536, 1280, 1, 256] + - [154, 0.0] + - - [120, 1024, 1, 72] + - [155, 0.0] + - - [120, 1024, 1, 120] + - [156, 0.0] + - - [120, 1024, 1, 144] + - [157, 0.0] + - - [120, 1024, 1, 160] + - [158, 0.0] + - - [120, 1024, 1, 240] + - [159, 0.0] + - - [120, 1024, 1, 512] + - [160, 0.0] + - - [125, 1024, 1, 320] + - [161, 0.0] + - - [256, 1024, 1, 7524] + - [162, 0.0] + - - [512, 1024, 1, 3000] + - [1272, 0.0] + - - [7524, 1024, 1, 256] + - [163, 0.0] + - - [128, 1792, 1, 112] + - [164, 0.0] + - - [128, 1792, 1, 2352] + - [165, 0.0] + - - [128, 1792, 1, 20224] + - [166, 0.0] + - - [160, 1792, 1, 804] + - [167, 0.0] + - - [320, 1792, 1, 112] + - [168, 0.0] + - - [320, 1792, 1, 380] + - [169, 0.0] + - - [320, 1792, 1, 804] + - [170, 0.0] + - - [320, 1792, 1, 2352] + - [171, 0.0] + - - [320, 1792, 1, 20224] + - [1267, 0.0] + - - [1, 3072, 1, 512] + - [172, 0.0] + - - [32, 3072, 1, 2048] + - [173, 0.0] + - - [256, 1024, 1, 256] + - [174, 0.0] + - - [256, 1024, 1, 49152] + - [175, 0.0] + - - [256, 196608, 1, 648] + - [176, 0.0] + - - [384, 196608, 1, 641] + - [177, 0.0] + - - [1024, 1024, 1, 5632] + - [951, 0.0] + - - [1032, 196608, 1, 2128] + - [178, 0.0] + - - [2048, 1024, 1, 20512] + - [179, 0.0] + - - [2048, 1024, 1, 68032] + - [180, 0.0] + - - [2304, 1024, 1, 9600] + - [181, 0.0] + - - [5632, 1024, 1, 1024] + - [182, 0.0] + - - [9600, 1024, 1, 2304] + - [183, 0.0] + - - [9600, 1024, 1, 5632] + - [184, 0.0] + - - [20512, 1024, 1, 256] + - [185, 0.0] + - - [20512, 1024, 1, 1024] + - [186, 0.0] + - - [68032, 1024, 1, 256] + - [187, 0.0] + - - [68032, 1024, 1, 1024] + - [188, 0.0] + - - [417, 286720, 1, 1867] + - [1271, 0.0] + - - [59744, 1792, 1, 256] + - [189, 0.0] + - - [59744, 1792, 1, 1024] + - [190, 0.0] + - - [64704, 1792, 1, 1024] + - [191, 0.0] + - - [352, 245760, 1, 3595] + - [1269, 0.0] + - - [10, 368640, 1, 10] + - [192, 0.0] + - - [10, 368640, 1, 221] + - [193, 0.0] + - - [256, 3072, 1, 2722] + - [194, 0.0] + - - [512, 3072, 1, 256] + - [195, 0.0] + - - [1024, 3072, 1, 18992] + - [196, 0.0] + - - [4800, 3072, 1, 1024] + - [197, 0.0] + - - [1, 1792, 1, 128] + - [198, 0.0] + - - [128, 358400, 1, 128] + - [199, 0.0] + - - [128, 57344, 1, 128] + - [200, 0.0] + - - [128, 57344, 1, 256] + - [201, 0.0] + - - [256, 57344, 1, 128] + - [202, 0.0] + - - [128, 1792, 128, 160] + - [203, 0.0] + - - [256, 1280, 1, 768] + - [204, 0.0] + - - [256, 1280, 1, 1664] + - [205, 0.0] + - - [256, 26480, 1, 640] + - [206, 0.0] + - - [256, 189360, 1, 512] + - [207, 0.0] + - - [256, 248960, 1, 1024] + - [208, 0.0] + - - [256, 256000, 1, 256] + - [209, 0.0] + - - [256, 600640, 1, 640] + - [210, 0.0] + - - [256, 640000, 1, 256] + - [211, 0.0] + - - [512, 1280, 1, 512] + - [212, 0.0] + - - [640, 1280, 1, 640] + - [213, 0.0] + - - [768, 1280, 1, 768] + - [214, 0.0] + - - [1024, 1280, 1, 1024] + - [215, 0.0] + - - [1664, 1280, 1, 1664] + - [216, 0.0] + - - [48, 98304, 1, 48] + - [217, 0.0] + - - [48, 98304, 1, 128] + - [218, 0.0] + - - [48, 614400, 1, 48] + - [219, 0.0] + - - [128, 98304, 1, 48] + - [220, 0.0] + - - [48, 3072, 128, 120] + - [221, 0.0] + - - [4, 21907200, 1, 8] + - [222, 0.0] + - - [2, 20920192, 1, 10] + - [223, 0.0] + - - [4, 13744384, 1, 12] + - [224, 0.0] + - - [1, 592704, 1, 12] + - [225, 0.0] + - - [1, 786144, 1, 9] + - [226, 0.0] + - - [1, 875568, 1, 8] + - [227, 0.0] + - - [1, 925632, 1, 10] + - [228, 0.0] + - - [16, 8192, 1, 8192] + - [229, 0.0] + - - [32000, 8192, 1, 8192] + - [230, 0.0] + - - [10240, 8192, 1, 16] + - [231, 0.0] + - - [8192, 8192, 1, 16] + - [232, 0.0] + - - [128, 40, 1, 2880] + - [233, 0.0] + - - [128, 184, 1, 2880] + - [234, 0.0] + - - [128, 232, 1, 2880] + - [235, 0.0] + - - [128, 416, 1, 2880] + - [236, 0.0] + - - [128, 576, 1, 2880] + - [237, 0.0] + - - [128, 768, 1, 2880] + - [238, 0.0] + - - [128, 880, 1, 2880] + - [239, 0.0] + - - [128, 1206, 1, 2880] + - [240, 0.0] + - - [128, 6146, 1, 2880] + - [241, 0.0] + - - [640, 16, 1, 2880] + - [242, 0.0] + - - [640, 48, 1, 2880] + - [243, 0.0] + - - [640, 64, 1, 2880] + - [244, 0.0] + - - [640, 176, 1, 2880] + - [245, 0.0] + - - [640, 288, 1, 2880] + - [246, 0.0] + - - [640, 320, 1, 2880] + - [247, 0.0] + - - [640, 512, 1, 2880] + - [248, 0.0] + - - [640, 640, 1, 2880] + - [249, 0.0] + - - [640, 768, 1, 2880] + - [250, 0.0] + - - [640, 912, 1, 2880] + - [251, 0.0] + - - [640, 1206, 1, 2880] + - [252, 0.0] + - - [640, 6146, 1, 2880] + - [253, 0.0] + - - [2880, 4, 1, 4096] + - [254, 0.0] + - - [2880, 1, 1, 4096] + - [254, 0.0] + - - [2880, 2, 1, 4096] + - [254, 0.0] + - - [2880, 48, 1, 512] + - [255, 0.0] + - - [2880, 72, 1, 4096] + - [256, 0.0] + - - [2880, 112, 1, 4096] + - [257, 0.0] + - - [2880, 152, 1, 512] + - [258, 0.0] + - - [2880, 184, 1, 512] + - [259, 0.0] + - - [2880, 208, 1, 4096] + - [260, 0.0] + - - [2880, 232, 1, 512] + - [261, 0.0] + - - [2880, 272, 1, 512] + - [262, 0.0] + - - [2880, 336, 1, 512] + - [263, 0.0] + - - [2880, 400, 1, 512] + - [264, 0.0] + - - [2880, 464, 1, 4096] + - [265, 0.0] + - - [2880, 528, 1, 4096] + - [266, 0.0] + - - [2880, 608, 1, 512] + - [267, 0.0] + - - [2880, 656, 1, 4096] + - [268, 0.0] + - - [2880, 720, 1, 4096] + - [269, 0.0] + - - [2880, 784, 1, 4096] + - [270, 0.0] + - - [2880, 864, 1, 512] + - [271, 0.0] + - - [2880, 912, 1, 4096] + - [272, 0.0] + - - [2880, 976, 1, 4096] + - [273, 0.0] + - - [2880, 1206, 1, 512] + - [274, 0.0] + - - [2880, 7169, 1, 512] + - [275, 0.0] + - - [2880, 7217, 1, 4096] + - [276, 0.0] + - - [5120, 4, 1, 2880] + - [277, 0.0] + - - [5120, 1, 1, 2880] + - [277, 0.0] + - - [5120, 2, 1, 2880] + - [277, 0.0] + - - [5120, 72, 1, 2880] + - [278, 0.0] + - - [5120, 120, 1, 2880] + - [279, 0.0] + - - [5120, 176, 1, 2880] + - [280, 0.0] + - - [5120, 240, 1, 2880] + - [281, 0.0] + - - [5120, 368, 1, 2880] + - [282, 0.0] + - - [5120, 496, 1, 2880] + - [283, 0.0] + - - [5120, 624, 1, 2880] + - [284, 0.0] + - - [5120, 784, 1, 2880] + - [285, 0.0] + - - [5120, 912, 1, 2880] + - [286, 0.0] + - - [5120, 1202, 1, 2880] + - [287, 0.0] + - - [5120, 7217, 1, 2880] + - [288, 0.0] + - - [25136, 5, 1, 2880] + - [289, 0.0] + - - [25136, 15, 1, 2880] + - [290, 0.0] + - - [25136, 30, 1, 2880] + - [291, 0.0] + - - [25136, 46, 1, 2880] + - [292, 0.0] + - - [25136, 62, 1, 2880] + - [293, 0.0] + - - [25136, 85, 1, 2880] + - [294, 0.0] + - - [201088, 8, 1, 2880] + - [295, 0.0] + - - [201088, 17, 1, 2880] + - [296, 0.0] + - - [201088, 33, 1, 2880] + - [297, 0.0] + - - [201088, 64, 1, 2880] + - [298, 0.0] + - - [128, 48, 1, 2880] + - [299, 0.0] + - - [128, 176, 1, 2880] + - [300, 0.0] + - - [128, 248, 1, 2880] + - [301, 0.0] + - - [128, 304, 1, 2880] + - [302, 0.0] + - - [128, 368, 1, 2880] + - [303, 0.0] + - - [128, 496, 1, 2880] + - [304, 0.0] + - - [128, 528, 1, 2880] + - [305, 0.0] + - - [128, 592, 1, 2880] + - [306, 0.0] + - - [128, 752, 1, 2880] + - [307, 0.0] + - - [128, 896, 1, 2880] + - [308, 0.0] + - - [128, 944, 1, 2880] + - [309, 0.0] + - - [128, 1008, 1, 2880] + - [310, 0.0] + - - [128, 2050, 1, 2880] + - [311, 0.0] + - - [128, 8192, 1, 2880] + - [312, 0.0] + - - [640, 96, 1, 2880] + - [313, 0.0] + - - [640, 136, 1, 2880] + - [314, 0.0] + - - [640, 224, 1, 2880] + - [315, 0.0] + - - [640, 272, 1, 2880] + - [316, 0.0] + - - [640, 352, 1, 2880] + - [317, 0.0] + - - [640, 384, 1, 2880] + - [318, 0.0] + - - [640, 528, 1, 2880] + - [319, 0.0] + - - [640, 624, 1, 2880] + - [320, 0.0] + - - [640, 752, 1, 2880] + - [321, 0.0] + - - [640, 880, 1, 2880] + - [322, 0.0] + - - [640, 1008, 1, 2880] + - [323, 0.0] + - - [640, 8183, 1, 2880] + - [324, 0.0] + - - [2880, 24, 1, 512] + - [325, 0.0] + - - [2880, 56, 1, 512] + - [326, 0.0] + - - [2880, 88, 1, 512] + - [327, 0.0] + - - [2880, 104, 1, 4096] + - [328, 0.0] + - - [2880, 136, 1, 512] + - [329, 0.0] + - - [2880, 176, 1, 4096] + - [330, 0.0] + - - [2880, 200, 1, 512] + - [331, 0.0] + - - [2880, 240, 1, 512] + - [332, 0.0] + - - [2880, 272, 1, 4096] + - [333, 0.0] + - - [2880, 352, 1, 4096] + - [334, 0.0] + - - [2880, 416, 1, 512] + - [335, 0.0] + - - [2880, 496, 1, 512] + - [336, 0.0] + - - [2880, 560, 1, 4096] + - [337, 0.0] + - - [2880, 624, 1, 512] + - [338, 0.0] + - - [2880, 688, 1, 4096] + - [339, 0.0] + - - [2880, 768, 1, 512] + - [340, 0.0] + - - [2880, 816, 1, 4096] + - [341, 0.0] + - - [2880, 896, 1, 512] + - [342, 0.0] + - - [2880, 960, 1, 512] + - [343, 0.0] + - - [2880, 1024, 1, 4096] + - [344, 0.0] + - - [2880, 4096, 1, 512] + - [345, 0.0] + - - [2880, 7204, 1, 512] + - [346, 0.0] + - - [2880, 8184, 1, 4096] + - [347, 0.0] + - - [5120, 32, 1, 2880] + - [348, 0.0] + - - [5120, 96, 1, 2880] + - [349, 0.0] + - - [5120, 160, 1, 2880] + - [350, 0.0] + - - [5120, 224, 1, 2880] + - [351, 0.0] + - - [5120, 304, 1, 2880] + - [352, 0.0] + - - [5120, 432, 1, 2880] + - [353, 0.0] + - - [5120, 560, 1, 2880] + - [354, 0.0] + - - [5120, 688, 1, 2880] + - [355, 0.0] + - - [5120, 800, 1, 2880] + - [356, 0.0] + - - [5120, 960, 1, 2880] + - [357, 0.0] + - - [5120, 4096, 1, 2880] + - [358, 0.0] + - - [5120, 8184, 1, 2880] + - [359, 0.0] + - - [25136, 10, 1, 2880] + - [360, 0.0] + - - [25136, 24, 1, 2880] + - [361, 0.0] + - - [25136, 35, 1, 2880] + - [362, 0.0] + - - [25136, 52, 1, 2880] + - [363, 0.0] + - - [25136, 80, 1, 2880] + - [364, 0.0] + - - [201088, 5, 1, 2880] + - [365, 0.0] + - - [201088, 25, 1, 2880] + - [366, 0.0] + - - [201088, 50, 1, 2880] + - [367, 0.0] + - - [201088, 65, 1, 2880] + - [368, 0.0] + - - [128, 56, 1, 2880] + - [369, 0.0] + - - [128, 144, 1, 2880] + - [370, 0.0] + - - [128, 208, 1, 2880] + - [371, 0.0] + - - [128, 240, 1, 2880] + - [372, 0.0] + - - [128, 480, 1, 2880] + - [373, 0.0] + - - [128, 624, 1, 2880] + - [374, 0.0] + - - [128, 672, 1, 2880] + - [375, 0.0] + - - [128, 736, 1, 2880] + - [376, 0.0] + - - [128, 912, 1, 2880] + - [377, 0.0] + - - [128, 1881, 1, 2880] + - [378, 0.0] + - - [128, 8183, 1, 2880] + - [379, 0.0] + - - [640, 8, 1, 2880] + - [380, 0.0] + - - [640, 32, 1, 2880] + - [381, 0.0] + - - [640, 128, 1, 2880] + - [382, 0.0] + - - [640, 160, 1, 2880] + - [383, 0.0] + - - [640, 256, 1, 2880] + - [384, 0.0] + - - [640, 368, 1, 2880] + - [385, 0.0] + - - [640, 400, 1, 2880] + - [386, 0.0] + - - [640, 496, 1, 2880] + - [387, 0.0] + - - [640, 656, 1, 2880] + - [388, 0.0] + - - [640, 784, 1, 2880] + - [389, 0.0] + - - [640, 896, 1, 2880] + - [390, 0.0] + - - [640, 1024, 1, 2880] + - [391, 0.0] + - - [640, 7169, 1, 2880] + - [392, 0.0] + - - [2880, 16, 1, 512] + - [393, 0.0] + - - [2880, 48, 1, 4096] + - [394, 0.0] + - - [2880, 80, 1, 4096] + - [395, 0.0] + - - [2880, 112, 1, 512] + - [396, 0.0] + - - [2880, 136, 1, 4096] + - [397, 0.0] + - - [2880, 168, 1, 4096] + - [398, 0.0] + - - [2880, 208, 1, 512] + - [399, 0.0] + - - [2880, 232, 1, 4096] + - [400, 0.0] + - - [2880, 288, 1, 4096] + - [401, 0.0] + - - [2880, 336, 1, 4096] + - [402, 0.0] + - - [2880, 400, 1, 4096] + - [403, 0.0] + - - [2880, 496, 1, 4096] + - [404, 0.0] + - - [2880, 560, 1, 512] + - [405, 0.0] + - - [2880, 624, 1, 4096] + - [406, 0.0] + - - [2880, 688, 1, 512] + - [407, 0.0] + - - [2880, 736, 1, 4096] + - [408, 0.0] + - - [2880, 800, 1, 512] + - [409, 0.0] + - - [2880, 880, 1, 512] + - [410, 0.0] + - - [2880, 944, 1, 512] + - [411, 0.0] + - - [2880, 1008, 1, 512] + - [412, 0.0] + - - [2880, 2049, 1, 512] + - [413, 0.0] + - - [2880, 7169, 1, 4096] + - [414, 0.0] + - - [2880, 8183, 1, 4096] + - [415, 0.0] + - - [5120, 16, 1, 2880] + - [416, 0.0] + - - [5120, 80, 1, 2880] + - [417, 0.0] + - - [5120, 136, 1, 2880] + - [418, 0.0] + - - [5120, 208, 1, 2880] + - [419, 0.0] + - - [5120, 288, 1, 2880] + - [420, 0.0] + - - [5120, 416, 1, 2880] + - [421, 0.0] + - - [5120, 544, 1, 2880] + - [422, 0.0] + - - [5120, 672, 1, 2880] + - [423, 0.0] + - - [5120, 832, 1, 2880] + - [424, 0.0] + - - [5120, 928, 1, 2880] + - [425, 0.0] + - - [5120, 2050, 1, 2880] + - [426, 0.0] + - - [5120, 8189, 1, 2880] + - [427, 0.0] + - - [25136, 11, 1, 2880] + - [428, 0.0] + - - [25136, 25, 1, 2880] + - [429, 0.0] + - - [25136, 40, 1, 2880] + - [430, 0.0] + - - [25136, 56, 1, 2880] + - [431, 0.0] + - - [25136, 77, 1, 2880] + - [432, 0.0] + - - [201088, 6, 1, 2880] + - [433, 0.0] + - - [201088, 16, 1, 2880] + - [434, 0.0] + - - [201088, 56, 1, 2880] + - [435, 0.0] + - - [201088, 80, 1, 2880] + - [436, 0.0] + - - [128, 72, 1, 2880] + - [437, 0.0] + - - [128, 192, 1, 2880] + - [438, 0.0] + - - [128, 352, 1, 2880] + - [439, 0.0] + - - [128, 688, 1, 2880] + - [440, 0.0] + - - [128, 800, 1, 2880] + - [441, 0.0] + - - [128, 832, 1, 2880] + - [442, 0.0] + - - [128, 864, 1, 2880] + - [443, 0.0] + - - [128, 1024, 1, 2880] + - [444, 0.0] + - - [128, 2049, 1, 2880] + - [445, 0.0] + - - [128, 5790, 1, 2880] + - [446, 0.0] + - - [128, 7204, 1, 2880] + - [447, 0.0] + - - [128, 7217, 1, 2880] + - [448, 0.0] + - - [128, 8184, 1, 2880] + - [449, 0.0] + - - [640, 4, 1, 2880] + - [450, 0.0] + - - [640, 1, 1, 2880] + - [450, 0.0] + - - [640, 2, 1, 2880] + - [450, 0.0] + - - [640, 104, 1, 2880] + - [451, 0.0] + - - [640, 192, 1, 2880] + - [452, 0.0] + - - [640, 208, 1, 2880] + - [453, 0.0] + - - [640, 232, 1, 2880] + - [454, 0.0] + - - [640, 416, 1, 2880] + - [455, 0.0] + - - [640, 560, 1, 2880] + - [456, 0.0] + - - [640, 688, 1, 2880] + - [457, 0.0] + - - [640, 800, 1, 2880] + - [458, 0.0] + - - [640, 928, 1, 2880] + - [459, 0.0] + - - [640, 1881, 1, 2880] + - [460, 0.0] + - - [640, 7204, 1, 2880] + - [461, 0.0] + - - [2880, 16, 1, 4096] + - [462, 0.0] + - - [2880, 40, 1, 512] + - [463, 0.0] + - - [2880, 72, 1, 512] + - [464, 0.0] + - - [2880, 120, 1, 512] + - [465, 0.0] + - - [2880, 144, 1, 4096] + - [466, 0.0] + - - [2880, 168, 1, 512] + - [467, 0.0] + - - [2880, 200, 1, 4096] + - [468, 0.0] + - - [2880, 240, 1, 4096] + - [469, 0.0] + - - [2880, 304, 1, 512] + - [470, 0.0] + - - [2880, 368, 1, 512] + - [471, 0.0] + - - [2880, 416, 1, 4096] + - [472, 0.0] + - - [2880, 480, 1, 512] + - [473, 0.0] + - - [2880, 528, 1, 512] + - [474, 0.0] + - - [2880, 592, 1, 4096] + - [475, 0.0] + - - [2880, 656, 1, 512] + - [476, 0.0] + - - [2880, 720, 1, 512] + - [477, 0.0] + - - [2880, 784, 1, 512] + - [478, 0.0] + - - [2880, 848, 1, 512] + - [479, 0.0] + - - [2880, 912, 1, 512] + - [480, 0.0] + - - [2880, 976, 1, 512] + - [481, 0.0] + - - [2880, 1202, 1, 4096] + - [482, 0.0] + - - [2880, 4096, 1, 4096] + - [483, 0.0] + - - [2880, 7821, 1, 4096] + - [484, 0.0] + - - [2880, 8192, 1, 512] + - [485, 0.0] + - - [5120, 56, 1, 2880] + - [486, 0.0] + - - [5120, 112, 1, 2880] + - [487, 0.0] + - - [5120, 184, 1, 2880] + - [488, 0.0] + - - [5120, 256, 1, 2880] + - [489, 0.0] + - - [5120, 352, 1, 2880] + - [490, 0.0] + - - [5120, 480, 1, 2880] + - [491, 0.0] + - - [5120, 608, 1, 2880] + - [492, 0.0] + - - [5120, 736, 1, 2880] + - [493, 0.0] + - - [5120, 864, 1, 2880] + - [494, 0.0] + - - [5120, 1024, 1, 2880] + - [495, 0.0] + - - [5120, 7821, 1, 2880] + - [496, 0.0] + - - [25136, 8, 1, 2880] + - [497, 0.0] + - - [25136, 16, 1, 2880] + - [497, 0.0] + - - [25136, 31, 1, 2880] + - [498, 0.0] + - - [25136, 48, 1, 2880] + - [499, 0.0] + - - [25136, 63, 1, 2880] + - [500, 0.0] + - - [25136, 89, 1, 2880] + - [501, 0.0] + - - [201088, 12, 1, 2880] + - [502, 0.0] + - - [201088, 32, 1, 2880] + - [503, 0.0] + - - [201088, 63, 1, 2880] + - [504, 0.0] + - - [128, 80, 1, 2880] + - [505, 0.0] + - - [128, 120, 1, 2880] + - [506, 0.0] + - - [128, 168, 1, 2880] + - [507, 0.0] + - - [128, 224, 1, 2880] + - [508, 0.0] + - - [128, 336, 1, 2880] + - [509, 0.0] + - - [128, 432, 1, 2880] + - [510, 0.0] + - - [128, 464, 1, 2880] + - [511, 0.0] + - - [128, 560, 1, 2880] + - [512, 0.0] + - - [128, 608, 1, 2880] + - [513, 0.0] + - - [128, 640, 1, 2880] + - [514, 0.0] + - - [128, 960, 1, 2880] + - [515, 0.0] + - - [128, 3073, 1, 2880] + - [516, 0.0] + - - [128, 7177, 1, 2880] + - [517, 0.0] + - - [128, 7821, 1, 2880] + - [518, 0.0] + - - [128, 8189, 1, 2880] + - [519, 0.0] + - - [640, 72, 1, 2880] + - [520, 0.0] + - - [640, 80, 1, 2880] + - [521, 0.0] + - - [640, 184, 1, 2880] + - [522, 0.0] + - - [640, 200, 1, 2880] + - [523, 0.0] + - - [640, 432, 1, 2880] + - [524, 0.0] + - - [640, 544, 1, 2880] + - [525, 0.0] + - - [640, 672, 1, 2880] + - [526, 0.0] + - - [640, 816, 1, 2880] + - [527, 0.0] + - - [640, 944, 1, 2880] + - [528, 0.0] + - - [640, 2049, 1, 2880] + - [529, 0.0] + - - [640, 5790, 1, 2880] + - [530, 0.0] + - - [640, 8192, 1, 2880] + - [531, 0.0] + - - [2880, 24, 1, 4096] + - [532, 0.0] + - - [2880, 56, 1, 4096] + - [533, 0.0] + - - [2880, 88, 1, 4096] + - [534, 0.0] + - - [2880, 120, 1, 4096] + - [535, 0.0] + - - [2880, 152, 1, 4096] + - [536, 0.0] + - - [2880, 184, 1, 4096] + - [537, 0.0] + - - [2880, 216, 1, 4096] + - [538, 0.0] + - - [2880, 248, 1, 4096] + - [539, 0.0] + - - [2880, 304, 1, 4096] + - [540, 0.0] + - - [2880, 368, 1, 4096] + - [541, 0.0] + - - [2880, 432, 1, 512] + - [542, 0.0] + - - [2880, 464, 1, 512] + - [543, 0.0] + - - [2880, 544, 1, 4096] + - [544, 0.0] + - - [2880, 608, 1, 4096] + - [545, 0.0] + - - [2880, 672, 1, 4096] + - [546, 0.0] + - - [2880, 736, 1, 512] + - [547, 0.0] + - - [2880, 800, 1, 4096] + - [548, 0.0] + - - [2880, 864, 1, 4096] + - [549, 0.0] + - - [2880, 928, 1, 4096] + - [550, 0.0] + - - [2880, 992, 1, 4096] + - [551, 0.0] + - - [2880, 1881, 1, 512] + - [552, 0.0] + - - [2880, 6146, 1, 512] + - [553, 0.0] + - - [2880, 8183, 1, 512] + - [554, 0.0] + - - [5120, 8, 1, 2880] + - [555, 0.0] + - - [5120, 48, 1, 2880] + - [556, 0.0] + - - [5120, 144, 1, 2880] + - [557, 0.0] + - - [5120, 200, 1, 2880] + - [558, 0.0] + - - [5120, 272, 1, 2880] + - [559, 0.0] + - - [5120, 400, 1, 2880] + - [560, 0.0] + - - [5120, 512, 1, 2880] + - [561, 0.0] + - - [5120, 640, 1, 2880] + - [562, 0.0] + - - [5120, 752, 1, 2880] + - [563, 0.0] + - - [5120, 896, 1, 2880] + - [564, 0.0] + - - [5120, 992, 1, 2880] + - [565, 0.0] + - - [5120, 7177, 1, 2880] + - [566, 0.0] + - - [25136, 7, 1, 2880] + - [567, 0.0] + - - [25136, 17, 1, 2880] + - [568, 0.0] + - - [25136, 32, 1, 2880] + - [569, 0.0] + - - [25136, 49, 1, 2880] + - [570, 0.0] + - - [25136, 66, 1, 2880] + - [571, 0.0] + - - [25136, 88, 1, 2880] + - [572, 0.0] + - - [201088, 10, 1, 2880] + - [573, 0.0] + - - [201088, 15, 1, 2880] + - [574, 0.0] + - - [201088, 40, 1, 2880] + - [575, 0.0] + - - [201088, 71, 1, 2880] + - [576, 0.0] + - - [201088, 87, 1, 2880] + - [577, 0.0] + - - [128, 88, 1, 2880] + - [578, 0.0] + - - [128, 136, 1, 2880] + - [579, 0.0] + - - [128, 288, 1, 2880] + - [580, 0.0] + - - [128, 320, 1, 2880] + - [581, 0.0] + - - [128, 400, 1, 2880] + - [582, 0.0] + - - [128, 544, 1, 2880] + - [583, 0.0] + - - [128, 720, 1, 2880] + - [584, 0.0] + - - [128, 848, 1, 2880] + - [585, 0.0] + - - [128, 928, 1, 2880] + - [586, 0.0] + - - [128, 7169, 1, 2880] + - [587, 0.0] + - - [640, 24, 1, 2880] + - [588, 0.0] + - - [640, 112, 1, 2880] + - [589, 0.0] + - - [640, 144, 1, 2880] + - [590, 0.0] + - - [640, 168, 1, 2880] + - [591, 0.0] + - - [640, 480, 1, 2880] + - [592, 0.0] + - - [640, 592, 1, 2880] + - [593, 0.0] + - - [640, 720, 1, 2880] + - [594, 0.0] + - - [640, 864, 1, 2880] + - [595, 0.0] + - - [640, 992, 1, 2880] + - [596, 0.0] + - - [640, 7215, 1, 2880] + - [597, 0.0] + - - [2880, 8, 1, 512] + - [598, 0.0] + - - [2880, 32, 1, 4096] + - [599, 0.0] + - - [2880, 64, 1, 4096] + - [600, 0.0] + - - [2880, 96, 1, 4096] + - [601, 0.0] + - - [2880, 128, 1, 4096] + - [602, 0.0] + - - [2880, 160, 1, 4096] + - [603, 0.0] + - - [2880, 192, 1, 4096] + - [604, 0.0] + - - [2880, 224, 1, 4096] + - [605, 0.0] + - - [2880, 256, 1, 4096] + - [606, 0.0] + - - [2880, 320, 1, 4096] + - [607, 0.0] + - - [2880, 384, 1, 4096] + - [608, 0.0] + - - [2880, 448, 1, 4096] + - [609, 0.0] + - - [2880, 512, 1, 4096] + - [610, 0.0] + - - [2880, 576, 1, 4096] + - [611, 0.0] + - - [2880, 640, 1, 4096] + - [612, 0.0] + - - [2880, 704, 1, 4096] + - [613, 0.0] + - - [2880, 768, 1, 4096] + - [614, 0.0] + - - [2880, 832, 1, 4096] + - [615, 0.0] + - - [2880, 896, 1, 4096] + - [616, 0.0] + - - [2880, 960, 1, 4096] + - [617, 0.0] + - - [2880, 1024, 1, 512] + - [618, 0.0] + - - [2880, 3073, 1, 512] + - [619, 0.0] + - - [2880, 7209, 1, 4096] + - [620, 0.0] + - - [2880, 8189, 1, 4096] + - [621, 0.0] + - - [5120, 40, 1, 2880] + - [622, 0.0] + - - [5120, 104, 1, 2880] + - [623, 0.0] + - - [5120, 168, 1, 2880] + - [624, 0.0] + - - [5120, 232, 1, 2880] + - [625, 0.0] + - - [5120, 336, 1, 2880] + - [626, 0.0] + - - [5120, 464, 1, 2880] + - [627, 0.0] + - - [5120, 592, 1, 2880] + - [628, 0.0] + - - [5120, 704, 1, 2880] + - [629, 0.0] + - - [5120, 816, 1, 2880] + - [630, 0.0] + - - [5120, 976, 1, 2880] + - [631, 0.0] + - - [5120, 1881, 1, 2880] + - [632, 0.0] + - - [5120, 8183, 1, 2880] + - [633, 0.0] + - - [25136, 9, 1, 2880] + - [634, 0.0] + - - [25136, 19, 1, 2880] + - [635, 0.0] + - - [25136, 33, 1, 2880] + - [636, 0.0] + - - [25136, 54, 1, 2880] + - [637, 0.0] + - - [25136, 74, 1, 2880] + - [638, 0.0] + - - [201088, 7, 1, 2880] + - [639, 0.0] + - - [201088, 23, 1, 2880] + - [640, 0.0] + - - [201088, 48, 1, 2880] + - [641, 0.0] + - - [201088, 79, 1, 2880] + - [642, 0.0] + - - [128, 104, 1, 2880] + - [643, 0.0] + - - [128, 160, 1, 2880] + - [644, 0.0] + - - [128, 200, 1, 2880] + - [645, 0.0] + - - [128, 384, 1, 2880] + - [646, 0.0] + - - [128, 448, 1, 2880] + - [647, 0.0] + - - [128, 656, 1, 2880] + - [648, 0.0] + - - [128, 992, 1, 2880] + - [649, 0.0] + - - [128, 7209, 1, 2880] + - [650, 0.0] + - - [640, 40, 1, 2880] + - [651, 0.0] + - - [640, 152, 1, 2880] + - [652, 0.0] + - - [640, 248, 1, 2880] + - [653, 0.0] + - - [640, 304, 1, 2880] + - [654, 0.0] + - - [640, 336, 1, 2880] + - [655, 0.0] + - - [640, 448, 1, 2880] + - [656, 0.0] + - - [640, 576, 1, 2880] + - [657, 0.0] + - - [640, 704, 1, 2880] + - [658, 0.0] + - - [640, 832, 1, 2880] + - [659, 0.0] + - - [640, 976, 1, 2880] + - [660, 0.0] + - - [640, 4096, 1, 2880] + - [661, 0.0] + - - [2880, 8, 1, 4096] + - [662, 0.0] + - - [2880, 40, 1, 4096] + - [663, 0.0] + - - [2880, 80, 1, 512] + - [664, 0.0] + - - [2880, 104, 1, 512] + - [665, 0.0] + - - [2880, 144, 1, 512] + - [666, 0.0] + - - [2880, 176, 1, 512] + - [667, 0.0] + - - [2880, 216, 1, 512] + - [668, 0.0] + - - [2880, 248, 1, 512] + - [669, 0.0] + - - [2880, 288, 1, 512] + - [670, 0.0] + - - [2880, 352, 1, 512] + - [671, 0.0] + - - [2880, 432, 1, 4096] + - [672, 0.0] + - - [2880, 480, 1, 4096] + - [673, 0.0] + - - [2880, 544, 1, 512] + - [674, 0.0] + - - [2880, 592, 1, 512] + - [675, 0.0] + - - [2880, 672, 1, 512] + - [676, 0.0] + - - [2880, 752, 1, 512] + - [677, 0.0] + - - [2880, 816, 1, 512] + - [678, 0.0] + - - [2880, 848, 1, 4096] + - [679, 0.0] + - - [2880, 928, 1, 512] + - [680, 0.0] + - - [2880, 992, 1, 512] + - [681, 0.0] + - - [2880, 1881, 1, 4096] + - [682, 0.0] + - - [2880, 5790, 1, 512] + - [683, 0.0] + - - [2880, 7215, 1, 512] + - [684, 0.0] + - - [2880, 8192, 1, 4096] + - [685, 0.0] + - - [5120, 64, 1, 2880] + - [686, 0.0] + - - [5120, 128, 1, 2880] + - [687, 0.0] + - - [5120, 192, 1, 2880] + - [688, 0.0] + - - [5120, 248, 1, 2880] + - [689, 0.0] + - - [5120, 384, 1, 2880] + - [690, 0.0] + - - [5120, 528, 1, 2880] + - [691, 0.0] + - - [5120, 656, 1, 2880] + - [692, 0.0] + - - [5120, 768, 1, 2880] + - [693, 0.0] + - - [5120, 880, 1, 2880] + - [694, 0.0] + - - [5120, 1008, 1, 2880] + - [695, 0.0] + - - [5120, 7209, 1, 2880] + - [696, 0.0] + - - [25136, 6, 1, 2880] + - [697, 0.0] + - - [25136, 14, 1, 2880] + - [698, 0.0] + - - [25136, 27, 1, 2880] + - [699, 0.0] + - - [25136, 44, 1, 2880] + - [700, 0.0] + - - [25136, 60, 1, 2880] + - [701, 0.0] + - - [25136, 128, 1, 2880] + - [702, 0.0] + - - [201088, 11, 1, 2880] + - [703, 0.0] + - - [201088, 31, 1, 2880] + - [704, 0.0] + - - [201088, 58, 1, 2880] + - [705, 0.0] + - - [201088, 128, 1, 2880] + - [706, 0.0] + - - [201088, 88, 1, 2880] + - [706, 0.0] + - - [201088, 89, 1, 2880] + - [706, 0.0] + - - [128, 96, 1, 2880] + - [707, 0.0] + - - [128, 112, 1, 2880] + - [708, 0.0] + - - [128, 152, 1, 2880] + - [709, 0.0] + - - [128, 216, 1, 2880] + - [710, 0.0] + - - [128, 272, 1, 2880] + - [711, 0.0] + - - [128, 512, 1, 2880] + - [712, 0.0] + - - [128, 704, 1, 2880] + - [713, 0.0] + - - [128, 784, 1, 2880] + - [714, 0.0] + - - [128, 816, 1, 2880] + - [715, 0.0] + - - [128, 976, 1, 2880] + - [716, 0.0] + - - [128, 1202, 1, 2880] + - [717, 0.0] + - - [128, 7215, 1, 2880] + - [718, 0.0] + - - [640, 56, 1, 2880] + - [719, 0.0] + - - [640, 88, 1, 2880] + - [720, 0.0] + - - [640, 120, 1, 2880] + - [721, 0.0] + - - [640, 216, 1, 2880] + - [722, 0.0] + - - [640, 240, 1, 2880] + - [723, 0.0] + - - [640, 464, 1, 2880] + - [724, 0.0] + - - [640, 608, 1, 2880] + - [725, 0.0] + - - [640, 736, 1, 2880] + - [726, 0.0] + - - [640, 848, 1, 2880] + - [727, 0.0] + - - [640, 960, 1, 2880] + - [728, 0.0] + - - [640, 3073, 1, 2880] + - [729, 0.0] + - - [640, 8184, 1, 2880] + - [730, 0.0] + - - [2880, 32, 1, 512] + - [731, 0.0] + - - [2880, 64, 1, 512] + - [732, 0.0] + - - [2880, 96, 1, 512] + - [733, 0.0] + - - [2880, 128, 1, 512] + - [734, 0.0] + - - [2880, 160, 1, 512] + - [735, 0.0] + - - [2880, 192, 1, 512] + - [736, 0.0] + - - [2880, 224, 1, 512] + - [737, 0.0] + - - [2880, 256, 1, 512] + - [738, 0.0] + - - [2880, 320, 1, 512] + - [739, 0.0] + - - [2880, 384, 1, 512] + - [740, 0.0] + - - [2880, 448, 1, 512] + - [741, 0.0] + - - [2880, 512, 1, 512] + - [742, 0.0] + - - [2880, 576, 1, 512] + - [743, 0.0] + - - [2880, 640, 1, 512] + - [744, 0.0] + - - [2880, 704, 1, 512] + - [745, 0.0] + - - [2880, 752, 1, 4096] + - [746, 0.0] + - - [2880, 832, 1, 512] + - [747, 0.0] + - - [2880, 880, 1, 4096] + - [748, 0.0] + - - [2880, 944, 1, 4096] + - [749, 0.0] + - - [2880, 1008, 1, 4096] + - [750, 0.0] + - - [2880, 2050, 1, 4096] + - [751, 0.0] + - - [2880, 7177, 1, 4096] + - [752, 0.0] + - - [2880, 8184, 1, 512] + - [753, 0.0] + - - [5120, 24, 1, 2880] + - [754, 0.0] + - - [5120, 88, 1, 2880] + - [755, 0.0] + - - [5120, 152, 1, 2880] + - [756, 0.0] + - - [5120, 216, 1, 2880] + - [757, 0.0] + - - [5120, 320, 1, 2880] + - [758, 0.0] + - - [5120, 448, 1, 2880] + - [759, 0.0] + - - [5120, 576, 1, 2880] + - [760, 0.0] + - - [5120, 720, 1, 2880] + - [761, 0.0] + - - [5120, 848, 1, 2880] + - [762, 0.0] + - - [5120, 944, 1, 2880] + - [763, 0.0] + - - [5120, 7169, 1, 2880] + - [764, 0.0] + - - [5120, 8192, 1, 2880] + - [765, 0.0] + - - [25136, 12, 1, 2880] + - [766, 0.0] + - - [25136, 22, 1, 2880] + - [767, 0.0] + - - [25136, 38, 1, 2880] + - [768, 0.0] + - - [25136, 57, 1, 2880] + - [769, 0.0] + - - [25136, 69, 1, 2880] + - [770, 0.0] + - - [201088, 9, 1, 2880] + - [771, 0.0] + - - [201088, 24, 1, 2880] + - [772, 0.0] + - - [201088, 42, 1, 2880] + - [773, 0.0] + - - [201088, 72, 1, 2880] + - [774, 0.0] + - - [20, 2048, 1, 3840] + - [775, 0.0] + - - [20, 10240, 1, 3840] + - [776, 0.0] + - - [20, 18432, 1, 3840] + - [777, 0.0] + - - [20, 34816, 1, 3840] + - [778, 0.0] + - - [32, 2048, 1, 2048] + - [779, 0.0] + - - [32, 2048, 1, 4096] + - [780, 0.0] + - - [32, 4096, 1, 1024] + - [781, 0.0] + - - [32, 4096, 1, 1536] + - [782, 0.0] + - - [32, 4096, 1, 4096] + - [783, 0.0] + - - [32, 6144, 1, 768] + - [784, 0.0] + - - [64, 40960, 1, 2048] + - [785, 0.0] + - - [256, 1536, 1, 2048] + - [786, 0.0] + - - [304, 12288, 1, 512] + - [787, 0.0] + - - [512, 12288, 1, 304] + - [788, 0.0] + - - [512, 12288, 1, 512] + - [789, 0.0] + - - [512, 12288, 1, 2048] + - [790, 0.0] + - - [512, 32768, 1, 4096] + - [791, 0.0] + - - [576, 16, 1, 576] + - [792, 0.0] + - - [576, 16, 1, 2304] + - [793, 0.0] + - - [576, 264, 1, 576] + - [794, 0.0] + - - [576, 264, 1, 2304] + - [795, 0.0] + - - [576, 2048, 1, 576] + - [796, 0.0] + - - [576, 2048, 1, 2304] + - [797, 0.0] + - - [576, 2048, 1, 3840] + - [798, 0.0] + - - [576, 12288, 1, 576] + - [799, 0.0] + - - [576, 12288, 1, 3840] + - [800, 0.0] + - - [1024, 8, 1, 512] + - [801, 0.0] + - - [1024, 8, 1, 1024] + - [802, 0.0] + - - [1152, 2048, 1, 576] + - [803, 0.0] + - - [1440, 1440, 1, 384] + - [804, 0.0] + - - [1728, 264, 1, 576] + - [805, 0.0] + - - [1728, 2048, 1, 576] + - [806, 0.0] + - - [1728, 12288, 1, 576] + - [807, 0.0] + - - [2048, 2056, 1, 2048] + - [808, 0.0] + - - [2048, 2056, 1, 3840] + - [809, 0.0] + - - [2048, 2056, 1, 8192] + - [810, 0.0] + - - [2048, 12288, 1, 512] + - [811, 0.0] + - - [2048, 12288, 1, 1536] + - [812, 0.0] + - - [2304, 16, 1, 576] + - [813, 0.0] + - - [2304, 264, 1, 576] + - [814, 0.0] + - - [2304, 2048, 1, 576] + - [815, 0.0] + - - [2816, 30720, 1, 2048] + - [816, 0.0] + - - [3840, 256, 1, 3840] + - [817, 0.0] + - - [3840, 512, 1, 3840] + - [818, 0.0] + - - [3840, 768, 1, 3840] + - [819, 0.0] + - - [3840, 1024, 1, 3840] + - [820, 0.0] + - - [3840, 1280, 1, 3840] + - [821, 0.0] + - - [3840, 1536, 1, 3840] + - [822, 0.0] + - - [3840, 1792, 1, 3840] + - [823, 0.0] + - - [3840, 2048, 1, 576] + - [824, 0.0] + - - [3840, 2048, 1, 3840] + - [825, 0.0] + - - [3840, 2056, 1, 2048] + - [826, 0.0] + - - [3840, 2560, 1, 3840] + - [827, 0.0] + - - [3840, 2816, 1, 3840] + - [828, 0.0] + - - [3840, 5632, 1, 3840] + - [829, 0.0] + - - [4096, 512, 1, 10240] + - [830, 0.0] + - - [5120, 512, 1, 5120] + - [1190, 0.0] + - - [5120, 520, 1, 5120] + - [831, 0.0] + - - [5120, 5760, 1, 64] + - [832, 0.0] + - - [5120, 11520, 1, 64] + - [833, 0.0] + - - [6240, 6240, 1, 384] + - [834, 0.0] + - - [8192, 2056, 1, 2048] + - [835, 0.0] + - - [100096, 40960, 1, 2048] + - [836, 0.0] + - - [1792, 2896, 1, 512] + - [837, 0.0] + - - [2, 6577472, 1, 9] + - [838, 0.0] + - - [2048, 1280, 1, 68032] + - [954, 0.0] + - - [256, 1280, 1, 384] + - [839, 0.0] + - - [256, 66960, 1, 1024] + - [840, 0.0] + - - [256, 114560, 1, 512] + - [841, 0.0] + - - [256, 136080, 1, 1920] + - [842, 0.0] + - - [256, 194160, 1, 1024] + - [843, 0.0] + - - [384, 1280, 1, 384] + - [844, 0.0] + - - [120, 1024, 1, 96] + - [845, 0.0] + - - [384, 196608, 1, 648] + - [846, 0.0] + - - [98304, 1024, 1, 9600] + - [847, 0.0] + - - [128, 3072, 1, 24] + - [848, 0.0] + - - [160, 3072, 1, 512] + - [849, 0.0] + - - [160, 3072, 1, 888] + - [1259, 0.0] + - - [256, 3072, 1, 24] + - [850, 0.0] + - - [316, 3072, 1, 256] + - [851, 0.0] + - - [320, 3072, 1, 24] + - [852, 0.0] + - - [512, 3072, 1, 32] + - [853, 0.0] + - - [512, 3072, 1, 64] + - [854, 0.0] + - - [512, 3072, 1, 96] + - [855, 0.0] + - - [512, 3072, 1, 128] + - [856, 0.0] + - - [512, 3072, 1, 192] + - [857, 0.0] + - - [512, 3072, 1, 1880] + - [858, 0.0] + - - [2048, 3072, 1, 4480] + - [859, 0.0] + - - [3840, 3072, 1, 2048] + - [860, 0.0] + - - [4480, 3072, 1, 768] + - [861, 0.0] + - - [32, 2048, 1, 64] + - [862, 0.0] + - - [32, 2048, 1, 128] + - [863, 0.0] + - - [64, 2048, 1, 192] + - [864, 0.0] + - - [128, 192, 1, 128] + - [865, 0.0] + - - [128, 2048, 1, 9984] + - [1280, 0.0] + - - [128, 93614, 1, 256] + - [1281, 0.0] + - - [128, 155468, 1, 128] + - [866, 0.0] + - - [128, 184146, 1, 128] + - [1282, 0.0] + - - [192, 192, 1, 192] + - [867, 0.0] + - - [192, 2048, 1, 512] + - [868, 0.0] + - - [192, 88177, 1, 832] + - [1283, 0.0] + - - [192, 110122, 1, 640] + - [869, 0.0] + - - [192, 207816, 1, 256] + - [870, 0.0] + - - [192, 225878, 1, 192] + - [871, 0.0] + - - [192, 262144, 1, 448] + - [872, 0.0] + - - [192, 385620, 1, 384] + - [873, 0.0] + - - [192, 391699, 1, 192] + - [1287, 0.0] + - - [192, 393216, 1, 448] + - [874, 0.0] + - - [192, 524288, 1, 448] + - [1288, 0.0] + - - [192, 559568, 1, 192] + - [1289, 0.0] + - - [192, 690621, 1, 192] + - [1290, 0.0] + - - [256, 192, 1, 256] + - [875, 0.0] + - - [256, 114688, 1, 128] + - [1292, 0.0] + - - [256, 131072, 1, 256] + - [876, 0.0] + - - [256, 262144, 1, 512] + - [877, 0.0] + - - [256, 384937, 1, 832] + - [1295, 0.0] + - - [256, 428780, 1, 448] + - [878, 0.0] + - - [256, 870382, 1, 256] + - [879, 0.0] + - - [256, 874159, 1, 256] + - [880, 0.0] + - - [384, 114688, 1, 128] + - [1296, 0.0] + - - [504, 262144, 1, 512] + - [881, 0.0] + - - [504, 262144, 1, 1414] + - [882, 0.0] + - - [512, 2048, 1, 512] + - [883, 0.0] + - - [512, 2048, 1, 1024] + - [884, 0.0] + - - [512, 2048, 1, 7168] + - [1297, 0.0] + - - [512, 114688, 1, 128] + - [885, 0.0] + - - [512, 262144, 1, 512] + - [886, 0.0] + - - [1024, 2048, 1, 2048] + - [887, 0.0] + - - [1024, 2048, 1, 7168] + - [1301, 0.0] + - - [4096, 4096, 1, 36864] + - [888, 0.0] + - - [9984, 2048, 1, 1024] + - [889, 0.0] + - - [15964, 2048, 1, 256] + - [1307, 0.0] + - - [16384, 2048, 1, 256] + - [890, 0.0] + - - [32768, 2048, 1, 9984] + - [891, 0.0] + - - [131456, 2048, 1, 1024] + - [892, 0.0] + - - [6, 6, 2048, 512] + - [893, 0.0] + - - [6, 6, 2048, 1024] + - [894, 0.0] + - - [4096, 4096, 1, 65728] + - [895, 0.0] + - - [6144, 1024, 1, 6144] + - [896, 0.0] + - - [24576, 1024, 1, 6144] + - [897, 0.0] + - - [6144, 1024, 1, 24576] + - [898, 0.0] + - - [1357, 1024, 48, 128] + - [899, 0.0] + - - [1024, 1024, 48, 128] + - [900, 0.0] + - - [256, 256, 64, 64] + - [901, 0.0] + - - [1024, 1024, 6, 64] + - [902, 0.0] + - - [6144, 65536, 1, 1024] + - [903, 0.0] + - - [6144, 65536, 1, 8] + - [904, 0.0] + - - [64, 2048, 1, 3072] + - [905, 0.0] + - - [512, 65536, 1, 512] + - [906, 0.0] + - - [768, 77, 1, 768] + - [907, 0.0] + - - [768, 77, 1, 3072] + - [908, 0.0] + - - [3072, 32, 1, 3072] + - [909, 0.0] + - - [3072, 32, 1, 4096] + - [910, 0.0] + - - [3072, 32, 1, 12288] + - [911, 0.0] + - - [3072, 77, 1, 768] + - [912, 0.0] + - - [3072, 2048, 1, 64] + - [913, 0.0] + - - [3072, 2048, 1, 3072] + - [914, 0.0] + - - [3072, 2048, 1, 12288] + - [915, 0.0] + - - [3072, 2080, 1, 3072] + - [916, 0.0] + - - [3072, 2080, 1, 15360] + - [917, 0.0] + - - [4096, 256, 1, 4096] + - [918, 0.0] + - - [4096, 256, 1, 10240] + - [919, 0.0] + - - [10240, 256, 1, 4096] + - [920, 0.0] + - - [12288, 32, 1, 3072] + - [921, 0.0] + - - [12288, 2048, 1, 3072] + - [922, 0.0] + - - [12288, 2080, 1, 3072] + - [923, 0.0] + - - [3072, 1, 1, 256] + - [924, 0.0] + - - [3072, 1, 1, 768] + - [925, 0.0] + - - [3072, 1, 1, 3072] + - [926, 0.0] + - - [6144, 1, 1, 3072] + - [927, 0.0] + - - [9216, 1, 1, 3072] + - [928, 0.0] + - - [18432, 1, 1, 3072] + - [929, 0.0] + - - [3840, 2304, 1, 3840] + - [930, 0.0] + - - [128, 18928, 1, 128] + - [1203, 0.0] + - - [128, 32768, 1, 128] + - [931, 0.0] + - - [128, 2119936, 1, 128] + - [932, 0.0] + - - [128, 3670016, 1, 128] + - [933, 0.0] + - - [134, 16800000, 1, 128] + - [934, 0.0] + - - [512, 18928, 1, 128] + - [935, 0.0] + - - [512, 32768, 1, 128] + - [936, 0.0] + - - [1024, 150000, 1, 1024] + - [937, 0.0] + - - [2048, 150000, 1, 1134] + - [938, 0.0] + - - [4096, 150000, 1, 1024] + - [939, 0.0] + - - [4096, 150000, 1, 2268] + - [940, 0.0] + - - [1280, 257, 1, 1280] + - [1132, 0.0] + - - [4096, 512, 1, 4096] + - [1188, 0.0] + - - [5120, 257, 1, 1280] + - [1134, 0.0] + - - [5120, 257, 1, 5120] + - [1135, 0.0] + - - [5120, 512, 1, 4096] + - [1189, 0.0] + - - [5120, 13640, 1, 5120] + - [955, 0.0] + - - [5120, 13640, 1, 13824] + - [956, 0.0] + - - [10240, 512, 1, 4096] + - [1200, 0.0] + - - [512, 512, 64, 64] + - [957, 0.0] + - - [1024, 128, 1, 4096] + - [958, 0.0] + - - [1024, 256, 1, 7168] + - [959, 0.0] + - - [1024, 512, 1, 7168] + - [960, 0.0] + - - [1024, 576, 1, 7168] + - [961, 0.0] + - - [1024, 1024, 1, 7168] + - [962, 0.0] + - - [1024, 1280, 1, 8192] + - [963, 0.0] + - - [1024, 1536, 1, 7168] + - [964, 0.0] + - - [1024, 2304, 1, 4096] + - [965, 0.0] + - - [1024, 2304, 1, 16384] + - [966, 0.0] + - - [1024, 2560, 1, 5120] + - [967, 0.0] + - - [1024, 2560, 1, 8192] + - [968, 0.0] + - - [1024, 3072, 1, 1536] + - [969, 0.0] + - - [1024, 4096, 1, 512] + - [970, 0.0] + - - [1024, 4096, 1, 1024] + - [971, 0.0] + - - [1024, 4096, 1, 2048] + - [972, 0.0] + - - [1024, 4096, 1, 7168] + - [973, 0.0] + - - [1024, 4096, 1, 8192] + - [974, 0.0] + - - [1024, 4608, 1, 7168] + - [975, 0.0] + - - [1024, 4608, 1, 16384] + - [976, 0.0] + - - [1024, 5120, 1, 1024] + - [977, 0.0] + - - [1024, 5120, 1, 2048] + - [978, 0.0] + - - [1024, 5120, 1, 3200] + - [979, 0.0] + - - [1024, 5120, 1, 6400] + - [980, 0.0] + - - [1024, 5120, 1, 8192] + - [981, 0.0] + - - [1024, 5120, 1, 25600] + - [982, 0.0] + - - [1024, 6144, 1, 1536] + - [983, 0.0] + - - [1024, 6400, 1, 5120] + - [984, 0.0] + - - [1024, 7168, 1, 256] + - [985, 0.0] + - - [1024, 7168, 1, 512] + - [986, 0.0] + - - [1024, 7168, 1, 2048] + - [987, 0.0] + - - [1024, 7168, 1, 2304] + - [988, 0.0] + - - [1024, 7168, 1, 4096] + - [989, 0.0] + - - [1024, 7168, 1, 4608] + - [990, 0.0] + - - [1024, 7168, 1, 8192] + - [991, 0.0] + - - [1024, 7168, 1, 16384] + - [992, 0.0] + - - [1024, 7168, 1, 18432] + - [993, 0.0] + - - [1024, 8192, 1, 512] + - [994, 0.0] + - - [1024, 8192, 1, 1024] + - [995, 0.0] + - - [1024, 8192, 1, 2048] + - [996, 0.0] + - - [1024, 8192, 1, 3584] + - [997, 0.0] + - - [1024, 8192, 1, 7168] + - [998, 0.0] + - - [1024, 8192, 1, 8192] + - [999, 0.0] + - - [1024, 8192, 1, 28672] + - [1000, 0.0] + - - [1024, 9216, 1, 4096] + - [1001, 0.0] + - - [1024, 9216, 1, 7168] + - [1002, 0.0] + - - [1024, 10240, 1, 5120] + - [1003, 0.0] + - - [1024, 10240, 1, 8192] + - [1004, 0.0] + - - [1024, 12800, 1, 5120] + - [1005, 0.0] + - - [1024, 13312, 1, 16384] + - [1006, 0.0] + - - [1024, 14336, 1, 8192] + - [1007, 0.0] + - - [1024, 16384, 1, 4096] + - [1008, 0.0] + - - [1024, 16384, 1, 6656] + - [1009, 0.0] + - - [1024, 16384, 1, 13312] + - [1010, 0.0] + - - [1024, 16384, 1, 16384] + - [1011, 0.0] + - - [1024, 16384, 1, 53248] + - [1012, 0.0] + - - [1024, 18432, 1, 16384] + - [1013, 0.0] + - - [1024, 24576, 1, 1536] + - [1014, 0.0] + - - [1024, 26624, 1, 16384] + - [1015, 0.0] + - - [1024, 36864, 1, 7168] + - [1016, 0.0] + - - [1024, 51200, 1, 5120] + - [1017, 0.0] + - - [1024, 57344, 1, 8192] + - [1018, 0.0] + - - [1024, 106496, 1, 16384] + - [1019, 0.0] + - - [2048, 256, 1, 7168] + - [1020, 0.0] + - - [2048, 512, 1, 7168] + - [1021, 0.0] + - - [2048, 576, 1, 7168] + - [1022, 0.0] + - - [2048, 1024, 1, 7168] + - [1023, 0.0] + - - [2048, 1152, 1, 4096] + - [1024, 0.0] + - - [2048, 1280, 1, 5120] + - [1025, 0.0] + - - [2048, 1280, 1, 8192] + - [1026, 0.0] + - - [2048, 1536, 1, 7168] + - [1027, 0.0] + - - [2048, 2304, 1, 4096] + - [1028, 0.0] + - - [2048, 2304, 1, 16384] + - [1029, 0.0] + - - [2048, 2560, 1, 5120] + - [1030, 0.0] + - - [2048, 2560, 1, 8192] + - [1031, 0.0] + - - [2048, 4096, 1, 512] + - [1032, 0.0] + - - [2048, 4096, 1, 1024] + - [1033, 0.0] + - - [2048, 4096, 1, 2048] + - [1034, 0.0] + - - [2048, 4096, 1, 7168] + - [1035, 0.0] + - - [2048, 4096, 1, 8192] + - [1036, 0.0] + - - [2048, 4608, 1, 7168] + - [1037, 0.0] + - - [2048, 4608, 1, 16384] + - [1038, 0.0] + - - [2048, 5120, 1, 1024] + - [1039, 0.0] + - - [2048, 5120, 1, 2048] + - [1040, 0.0] + - - [2048, 5120, 1, 3200] + - [1041, 0.0] + - - [2048, 5120, 1, 6400] + - [1042, 0.0] + - - [2048, 5120, 1, 8192] + - [1043, 0.0] + - - [2048, 5120, 1, 25600] + - [1044, 0.0] + - - [2048, 6144, 1, 1536] + - [1045, 0.0] + - - [2048, 6400, 1, 5120] + - [1046, 0.0] + - - [2048, 7168, 1, 256] + - [1047, 0.0] + - - [2048, 7168, 1, 512] + - [1048, 0.0] + - - [2048, 7168, 1, 2048] + - [1049, 0.0] + - - [2048, 7168, 1, 8192] + - [1050, 0.0] + - - [2048, 7168, 1, 16384] + - [1051, 0.0] + - - [2048, 7168, 1, 18432] + - [1052, 0.0] + - - [2048, 8192, 1, 3584] + - [1053, 0.0] + - - [2048, 8192, 1, 7168] + - [1054, 0.0] + - - [2048, 8192, 1, 8192] + - [1055, 0.0] + - - [2048, 8192, 1, 28672] + - [1056, 0.0] + - - [2048, 9216, 1, 4096] + - [1057, 0.0] + - - [2048, 9216, 1, 7168] + - [1058, 0.0] + - - [2048, 10240, 1, 5120] + - [1059, 0.0] + - - [2048, 10240, 1, 8192] + - [1060, 0.0] + - - [2048, 12800, 1, 5120] + - [1061, 0.0] + - - [2048, 13312, 1, 16384] + - [1062, 0.0] + - - [2048, 14336, 1, 8192] + - [1063, 0.0] + - - [2048, 16384, 1, 2048] + - [1064, 0.0] + - - [2048, 16384, 1, 4096] + - [1065, 0.0] + - - [2048, 16384, 1, 6656] + - [1066, 0.0] + - - [2048, 16384, 1, 13312] + - [1067, 0.0] + - - [2048, 16384, 1, 16384] + - [1068, 0.0] + - - [2048, 16384, 1, 53248] + - [1069, 0.0] + - - [2048, 18432, 1, 16384] + - [1070, 0.0] + - - [2048, 24576, 1, 1536] + - [1071, 0.0] + - - [2048, 26624, 1, 16384] + - [1072, 0.0] + - - [2048, 36864, 1, 7168] + - [1073, 0.0] + - - [2048, 51200, 1, 5120] + - [1074, 0.0] + - - [2048, 106496, 1, 16384] + - [1075, 0.0] + - - [4096, 128, 1, 4096] + - [1076, 0.0] + - - [4096, 256, 1, 7168] + - [1077, 0.0] + - - [4096, 512, 1, 7168] + - [1078, 0.0] + - - [4096, 576, 1, 7168] + - [1079, 0.0] + - - [4096, 1024, 1, 7168] + - [1080, 0.0] + - - [4096, 1152, 1, 4096] + - [1081, 0.0] + - - [4096, 1280, 1, 5120] + - [1082, 0.0] + - - [4096, 1280, 1, 8192] + - [1083, 0.0] + - - [4096, 1536, 1, 7168] + - [1084, 0.0] + - - [4096, 2304, 1, 4096] + - [1085, 0.0] + - - [4096, 2304, 1, 16384] + - [1086, 0.0] + - - [4096, 2560, 1, 8192] + - [1087, 0.0] + - - [4096, 3072, 1, 1536] + - [1088, 0.0] + - - [4096, 4096, 1, 1024] + - [1089, 0.0] + - - [4096, 4096, 1, 7168] + - [1090, 0.0] + - - [4096, 4096, 1, 8192] + - [1091, 0.0] + - - [4096, 4608, 1, 7168] + - [1092, 0.0] + - - [4096, 4608, 1, 16384] + - [1093, 0.0] + - - [4096, 5120, 1, 1024] + - [1094, 0.0] + - - [4096, 5120, 1, 2048] + - [1095, 0.0] + - - [4096, 5120, 1, 3200] + - [1096, 0.0] + - - [4096, 5120, 1, 6400] + - [1097, 0.0] + - - [4096, 5120, 1, 8192] + - [1098, 0.0] + - - [4096, 5120, 1, 25600] + - [1099, 0.0] + - - [4096, 6144, 1, 1536] + - [1100, 0.0] + - - [4096, 6400, 1, 5120] + - [1101, 0.0] + - - [4096, 7168, 1, 512] + - [1102, 0.0] + - - [4096, 7168, 1, 2048] + - [1103, 0.0] + - - [4096, 7168, 1, 2304] + - [1104, 0.0] + - - [4096, 7168, 1, 4096] + - [1105, 0.0] + - - [4096, 7168, 1, 4608] + - [1106, 0.0] + - - [4096, 7168, 1, 8192] + - [1107, 0.0] + - - [4096, 7168, 1, 16384] + - [1108, 0.0] + - - [4096, 7168, 1, 18432] + - [1109, 0.0] + - - [4096, 8192, 1, 3584] + - [1110, 0.0] + - - [4096, 8192, 1, 7168] + - [1111, 0.0] + - - [4096, 8192, 1, 8192] + - [1112, 0.0] + - - [4096, 8192, 1, 28672] + - [1113, 0.0] + - - [4096, 9216, 1, 4096] + - [1114, 0.0] + - - [4096, 9216, 1, 7168] + - [1115, 0.0] + - - [4096, 10240, 1, 5120] + - [1116, 0.0] + - - [4096, 10240, 1, 8192] + - [1117, 0.0] + - - [4096, 12800, 1, 5120] + - [1118, 0.0] + - - [4096, 13312, 1, 16384] + - [1119, 0.0] + - - [4096, 14336, 1, 8192] + - [1120, 0.0] + - - [4096, 16384, 1, 2048] + - [1121, 0.0] + - - [4096, 16384, 1, 4096] + - [1122, 0.0] + - - [4096, 16384, 1, 6656] + - [1123, 0.0] + - - [4096, 16384, 1, 16384] + - [1124, 0.0] + - - [4096, 16384, 1, 53248] + - [1125, 0.0] + - - [4096, 18432, 1, 16384] + - [1126, 0.0] + - - [4096, 24576, 1, 1536] + - [1127, 0.0] + - - [4096, 26624, 1, 16384] + - [1128, 0.0] + - - [4096, 51200, 1, 5120] + - [1129, 0.0] + - - [4096, 106496, 1, 16384] + - [1130, 0.0] + - - [8192, 8192, 1, 37888] + - [1131, 0.0] + - - [1280, 264, 1, 1280] + - [1133, 0.0] + - - [5120, 264, 1, 1280] + - [1136, 0.0] + - - [5120, 264, 1, 5120] + - [1137, 0.0] + - - [5120, 9419, 1, 5120] + - [1138, 0.0] + - - [5120, 9419, 1, 13824] + - [1139, 0.0] + - - [5120, 9424, 1, 13824] + - [1140, 0.0] + - - [5120, 18389, 1, 5120] + - [1141, 0.0] + - - [5120, 18392, 1, 5120] + - [1142, 0.0] + - - [5120, 21090, 1, 5120] + - [1195, 0.0] + - - [5120, 21090, 1, 13824] + - [1196, 0.0] + - - [5120, 21096, 1, 5120] + - [1143, 0.0] + - - [5120, 21096, 1, 13824] + - [1144, 0.0] + - - [13824, 9419, 1, 5120] + - [1145, 0.0] + - - [13824, 9424, 1, 5120] + - [1146, 0.0] + - - [13824, 18389, 1, 5120] + - [1147, 0.0] + - - [13824, 18392, 1, 5120] + - [1148, 0.0] + - - [13824, 21090, 1, 5120] + - [1149, 0.0] + - - [13824, 21096, 1, 5120] + - [1150, 0.0] + - - [3072, 256, 1, 3072] + - [1151, 0.0] + - - [3072, 17, 1, 3072] + - [1152, 0.0] + - - [3072, 33677, 1, 15360] + - [1153, 0.0] + - - [3072, 33660, 1, 12288] + - [1154, 0.0] + - - [12288, 33677, 1, 3072] + - [1155, 0.0] + - - [12288, 33660, 1, 3072] + - [1156, 0.0] + - - [3072, 14867, 1, 3072] + - [1157, 0.0] + - - [3072, 14867, 1, 15360] + - [1158, 0.0] + - - [12288, 14867, 1, 3072] + - [1159, 0.0] + - - [3072, 14850, 1, 12288] + - [1160, 0.0] + - - [3072, 33660, 1, 3072] + - [1161, 0.0] + - - [64, 33660, 1, 3072] + - [1162, 0.0] + - - [3072, 33677, 1, 3072] + - [1163, 0.0] + - - [3072, 33664, 1, 12288] + - [1157, 0.0] + - - [3072, 33680, 1, 15360] + - [1158, 0.0] + - - [12288, 33680, 1, 3072] + - [1164, 0.0] + - - [12288, 33664, 1, 3072] + - [1165, 0.0] + - - [64, 14850, 1, 3072] + - [1166, 0.0] + - - [3072, 14856, 1, 12288] + - [1167, 0.0] + - - [12288, 14856, 1, 3072] + - [1168, 0.0] + - - [3072, 14872, 1, 15360] + - [1169, 0.0] + - - [12288, 14872, 1, 3072] + - [1170, 0.0] + - - [3072, 14850, 1, 3072] + - [1171, 0.0] + - - [16032, 128, 1, 16384] + - [1172, 0.0] + - - [16032, 64, 1, 16384] + - [1173, 0.0] + - - [16032, 16, 1, 16384] + - [1174, 0.0] + - - [16032, 8, 1, 16384] + - [1175, 0.0] + - - [16032, 32, 1, 16384] + - [1176, 0.0] + - - [128256, 8, 1, 8192] + - [1177, 0.0] + - - [128256, 16, 1, 8192] + - [1178, 0.0] + - - [128256, 64, 1, 8192] + - [1179, 0.0] + - - [128256, 128, 1, 8192] + - [1180, 0.0] + - - [128256, 32, 1, 8192] + - [1181, 0.0] + - - [128256, 124, 1, 8192] + - [1182, 0.0] + - - [128256, 127, 1, 8192] + - [1183, 0.0] + - - [128256, 125, 1, 8192] + - [1184, 0.0] + - - [128256, 126, 1, 8192] + - [1185, 0.0] + - - [128256, 123, 1, 8192] + - [1186, 0.0] + - - [128256, 122, 1, 8192] + - [1187, 0.0] + - - [5120, 4106, 1, 5120] + - [1191, 0.0] + - - [5120, 4106, 1, 13824] + - [1192, 0.0] + - - [5120, 4200, 1, 5120] + - [1193, 0.0] + - - [5120, 4200, 1, 13824] + - [1194, 0.0] + - - [5120, 21263, 1, 5120] + - [1197, 0.0] + - - [5120, 21263, 1, 13824] + - [1198, 0.0] + - - [5120, 21264, 1, 13824] + - [1199, 0.0] + - - [13824, 4106, 1, 5120] + - [1201, 0.0] + - - [13824, 4200, 1, 5120] + - [1202, 0.0] + - - [128, 18928, 1, 512] + - [1204, 0.0] + - - [1024, 150000, 1, 4096] + - [1205, 0.0] + - - [1134, 150000, 1, 2048] + - [1206, 0.0] + - - [2048, 150000, 1, 8192] + - [1207, 0.0] + - - [2268, 150000, 1, 4096] + - [1208, 0.0] + - - [4096, 150000, 1, 2048] + - [1209, 0.0] + - - [128, 32768, 1, 512] + - [1210, 0.0] + - - [128, 16800000, 1, 134] + - [1211, 0.0] + - - [576, 7680, 1, 576] + - [1212, 0.0] + - - [128, 14000000, 1, 134] + - [1213, 0.0] + - - [512, 7680, 1, 304] + - [1214, 0.0] + - - [512, 7680, 1, 512] + - [1215, 0.0] + - - [512, 7680, 1, 2048] + - [1216, 0.0] + - - [512, 26696, 1, 128] + - [1217, 0.0] + - - [576, 10, 1, 576] + - [1218, 0.0] + - - [576, 10, 1, 2304] + - [1219, 0.0] + - - [576, 1280, 1, 2304] + - [1220, 0.0] + - - [576, 1280, 1, 3840] + - [1221, 0.0] + - - [576, 7680, 1, 3840] + - [1222, 0.0] + - - [1024, 5, 1, 512] + - [1223, 0.0] + - - [1024, 5, 1, 1024] + - [1224, 0.0] + - - [1024, 125000, 1, 128] + - [1225, 0.0] + - - [20, 21760, 1, 3840] + - [1226, 0.0] + - - [1024, 125000, 1, 1024] + - [1227, 0.0] + - - [1152, 1280, 1, 576] + - [1228, 0.0] + - - [1728, 165, 1, 576] + - [1229, 0.0] + - - [1728, 1280, 1, 576] + - [1230, 0.0] + - - [1728, 7680, 1, 576] + - [1231, 0.0] + - - [2048, 1285, 1, 2048] + - [1232, 0.0] + - - [2048, 1285, 1, 3840] + - [1233, 0.0] + - - [2048, 1285, 1, 8192] + - [1234, 0.0] + - - [2048, 125000, 1, 8192] + - [1235, 0.0] + - - [2268, 125000, 1, 4096] + - [1236, 0.0] + - - [2304, 1280, 1, 576] + - [1237, 0.0] + - - [128, 26696, 1, 512] + - [1238, 0.0] + - - [3840, 1280, 1, 576] + - [1239, 0.0] + - - [3840, 1285, 1, 2048] + - [1240, 0.0] + - - [3840, 6400, 1, 3840] + - [1241, 0.0] + - - [3840, 7680, 1, 576] + - [1242, 0.0] + - - [3840, 11520, 1, 3840] + - [1243, 0.0] + - - [4096, 125000, 1, 1024] + - [1244, 0.0] + - - [4096, 125000, 1, 2048] + - [1245, 0.0] + - - [7680, 1280, 1, 3840] + - [1246, 0.0] + - - [96, 96, 10, 96] + - [1247, 0.0] + - - [128, 2989952, 1, 128] + - [1248, 0.0] + - - [16032, 5, 1, 16384] + - [1249, 0.0] + - - [32, 286720, 1, 257] + - [1250, 0.0] + - - [57, 491520, 1, 1219] + - [1252, 0.0] + - - [256, 196608, 1, 641] + - [1264, 0.0] + - - [256, 245760, 1, 641] + - [1265, 0.0] + - - [32, 2048, 1, 9984] + - [1279, 0.0] + - - [192, 170757, 1, 192] + - [1284, 0.0] + - - [192, 342315, 1, 448] + - [1285, 0.0] + - - [192, 359839, 1, 256] + - [1286, 0.0] + - - [256, 2048, 1, 15964] + - [1291, 0.0] + - - [256, 131072, 1, 128] + - [1293, 0.0] + - - [256, 131072, 1, 192] + - [1294, 0.0] + - - [512, 2048, 1, 9984] + - [1298, 0.0] + - - [512, 2048, 1, 10752] + - [1299, 0.0] + - - [768, 2048, 1, 15964] + - [1300, 0.0] + - - [1024, 2048, 1, 9984] + - [1302, 0.0] + - - [1024, 2048, 1, 10752] + - [1303, 0.0] + - - [1024, 2048, 1, 14336] + - [1304, 0.0] + - - [1024, 2048, 1, 32768] + - [1305, 0.0] + - - [1920, 2048, 1, 15964] + - [1306, 0.0] + - - [4096, 3840, 1, 8192] + - [1308, 0.0] + - - [4096, 3328, 1, 8192] + - [1309, 427.47] + - - [3584, 4096, 1, 8192] + - [1310, 0.0] + - - [3072, 4096, 1, 8192] + - [1311, 0.0] +- null +- null +- DeviceEfficiency +- Equality From 6c7223e735b14ad08f5c0cafedd1b9f0b4687b9d Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Tue, 25 Nov 2025 01:52:56 +0000 Subject: [PATCH 11/18] typo --- .../gfx950/custom_mainloop_scheduling.yaml | 64 +++++++++---------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml index aed38a8acb8..dc6097d1f52 100755 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml @@ -776,7 +776,7 @@ BenchmarkProblems: - KernelLanguage: ["Assembly"] ForkParameters: - MatrixInstruction: - - [16, 16,32, 1, 1, 8, 6, 2,2 ] + - [16,16,32,1, 1, 8, 3, 2, 2] - PrefetchGlobalRead: [2] - PrefetchLocalRead: [1] - DepthU: [64] @@ -787,24 +787,27 @@ BenchmarkProblems: - GlobalReadVectorWidthA: [8] - GlobalReadVectorWidthB: [8] - DirectToLds: [1] + - LDSTrInst: [1] - StreamK: [3] - - LdsPadA: [8] #[-1] - - LdsPadB: [8] #[-1] + - LdsPadA: [-1] + - LdsPadB: [-1] - StaggerU: [0] - - WorkGroupMapping: [16] - - WorkGroupMappingXCC: [2] + #- WorkGroupMapping: [16] + #- WorkGroupMappingXCC: [2] - 1LDSBuffer: [0] - NonTemporalD: [4] - SourceSwap: [1] - UseSgprForGRO: [0] - - UseCustomMainLoopSchedule: [0, 1] + - UseCustomMainLoopSchedule: [1] + - MaxLDS: [163840] BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: - - Range: [[256], [192], [1], [64, 64, 256]] - - Range: [[256], [192], [1], [1, 1, 64]] - - Range: [[256], [192], [1], [32, 64, 256]] - - Exact: [4096, 3072, 1, 8192] + - Range: [[256], [96], [1], [64, 64, 256]] + - Range: [[256], [96], [1], [1, 1, 64]] + - Range: [[256], [96], [1], [32, 64, 256]] + - Range: [[8192], [3072], [1], [64, 64, 256]] + - Exact: [4096, 1536, 1, 8192] # Compute bound - BiasTypeArgs: ['b'] - # BenchmarkProblemSizeGroup - Standard - All problem InitialSolutionParameters: @@ -812,7 +815,7 @@ BenchmarkProblems: - KernelLanguage: ["Assembly"] ForkParameters: - MatrixInstruction: - - [16,16,32,1, 1, 8, 3, 2, 2] + - [16, 16, 32, 1, 1, 5, 8, 2, 2] - PrefetchGlobalRead: [2] - PrefetchLocalRead: [1] - DepthU: [64] @@ -823,27 +826,25 @@ BenchmarkProblems: - GlobalReadVectorWidthA: [8] - GlobalReadVectorWidthB: [8] - DirectToLds: [1] - - LDSTrInst: [1] - StreamK: [3] - LdsPadA: [-1] - LdsPadB: [-1] - StaggerU: [0] - #- WorkGroupMapping: [16] - #- WorkGroupMappingXCC: [2] + - WorkGroupMapping: [0] + - WorkGroupMappingXCC: [-1] - 1LDSBuffer: [0] - NonTemporalD: [4] - SourceSwap: [1] - UseSgprForGRO: [0] - - UseCustomMainLoopSchedule: [1] - - MaxLDS: [163840] + - UseCustomMainLoopSchedule: [0, 1] BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: - - Range: [[256], [96], [1], [64, 64, 256]] - - Range: [[256], [96], [1], [1, 1, 64]] - - Range: [[256], [96], [1], [32, 64, 256]] - - Range: [[8192], [3072], [1], [64, 64, 256]] - - Exact: [4096, 1536, 1, 8192] # Compute bound + - Range: [[160], [256], [1], [64, 64, 256]] + - Range: [[160], [256], [1], [1, 1, 64]] + - Range: [[160], [256], [1], [32, 64, 256]] + - Range: [[5120], [8192], [1], [64, 64, 256]] + - Exact: [2560, 4096, 1, 8192] - BiasTypeArgs: ['b'] - # BenchmarkProblemSizeGroup - Standard - All problem InitialSolutionParameters: @@ -851,7 +852,7 @@ BenchmarkProblems: - KernelLanguage: ["Assembly"] ForkParameters: - MatrixInstruction: - - [16, 16, 32, 1, 1, 5, 8, 2, 2] + - [16, 16,32, 1, 1, 6, 8, 2,2 ] - PrefetchGlobalRead: [2] - PrefetchLocalRead: [1] - DepthU: [64] @@ -863,11 +864,11 @@ BenchmarkProblems: - GlobalReadVectorWidthB: [8] - DirectToLds: [1] - StreamK: [3] - - LdsPadA: [-1] - - LdsPadB: [-1] + - LdsPadA: [8] #[-1] + - LdsPadB: [8] #[-1] - StaggerU: [0] - - WorkGroupMapping: [0] - - WorkGroupMappingXCC: [-1] + - WorkGroupMapping: [16] + - WorkGroupMappingXCC: [2] - 1LDSBuffer: [0] - NonTemporalD: [4] - SourceSwap: [1] @@ -876,14 +877,13 @@ BenchmarkProblems: BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: - - Range: [[160], [256], [1], [64, 64, 256]] - - Range: [[160], [256], [1], [1, 1, 64]] - - Range: [[160], [256], [1], [32, 64, 256]] - - Range: [[5120], [8192], [1], [64, 64, 256]] - - Exact: [2560, 4096, 1, 8192] + - Range: [[192], [256], [1], [64, 64, 256]] + - Range: [[192], [256], [1], [1, 1, 64]] + - Range: [[192], [256], [1], [32, 64, 256]] + - Range: [[6144], [8192], [1], [64, 64, 256]] + - Exact: [3072, 4096, 1, 8192] - BiasTypeArgs: ['b'] - ######################################## # HHS TN - standard ######################################## From f9f9b0c15c4ac776a08ff0c382157939f4aa1d51 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Tue, 25 Nov 2025 01:54:57 +0000 Subject: [PATCH 12/18] typo --- ...k_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml | 246 ++++++++++++++++++ 1 file changed, 246 insertions(+) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml index a19c9635c7b..b9f787e1361 100755 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml @@ -307803,6 +307803,250 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1iMO6yjTG4yuZPDkcMXrcObwuUOXV_ipLalV5INSQqYE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123776 + LdsInitCVgprs: false + LdsNumBytes: 123776 + LdsNumElementsAlignedA: 24960 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24960 + LdsOffsetB_Blk: 90496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24960 + LdsOffsetMetadata_Blk: 90496 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumTotalPackedLoadsA: 6 + NumTotalPackedLoadsB: 8 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1315 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false - [2, 3, 0, 1] - - - [16, 368640, 1, 224] - [0, 0.0] @@ -310456,6 +310700,8 @@ - [1313, 0.0] - - [2560, 4096, 1, 8192] - [1314, 0.0] + - - [3072, 4096, 1, 8192] + - [1315, 0.0] - null - null - DeviceEfficiency From 1c4f0c06823437f06645e75415bb2f136c1ff2ce Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Tue, 25 Nov 2025 01:58:53 +0000 Subject: [PATCH 13/18] typo --- .../Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml index dc6097d1f52..5533e129d27 100755 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml @@ -733,6 +733,7 @@ BenchmarkProblems: - Range: [[8192], [7168], [1], [64, 64, 256]] - Exact: [4096, 3584, 1, 8192] - BiasTypeArgs: ['b'] + - # BenchmarkProblemSizeGroup - Standard - All problem InitialSolutionParameters: BenchmarkCommonParameters: @@ -769,7 +770,6 @@ BenchmarkProblems: - Range: [[256], [192], [1], [32, 64, 256]] - Exact: [4096, 3072, 1, 8192] - BiasTypeArgs: ['b'] - - # BenchmarkProblemSizeGroup - Standard - All problem InitialSolutionParameters: BenchmarkCommonParameters: @@ -846,6 +846,7 @@ BenchmarkProblems: - Range: [[5120], [8192], [1], [64, 64, 256]] - Exact: [2560, 4096, 1, 8192] - BiasTypeArgs: ['b'] + - # BenchmarkProblemSizeGroup - Standard - All problem InitialSolutionParameters: BenchmarkCommonParameters: From 84c119256a40c66ef937f22140c1bdaa0258702e Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Tue, 25 Nov 2025 10:33:51 +0000 Subject: [PATCH 14/18] rename --- ...ijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml index b9f787e1361..db8c60e947b 100755 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml @@ -307805,7 +307805,7 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 @@ -307814,7 +307814,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1iMO6yjTG4yuZPDkcMXrcObwuUOXV_ipLalV5INSQqYE= + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_SAV_UserArgs_MT192x25GrlcXKC7t-UqBC8MTU-ba4yBkbzpcFIFSmhpwzdMe9w= BufferLoad: true BufferStore: true CUCount: null @@ -307857,7 +307857,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -307926,7 +307926,7 @@ MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 - MfmaInitCVgprs: false + MfmaInitCVgprs: true NoLdsWriteCode: true NoReject: false NoTailLoop: false @@ -307968,7 +307968,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1315 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -308046,7 +308046,7 @@ reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false - tailLoopOptB: false + tailLoopOptB: false - [2, 3, 0, 1] - - - [16, 368640, 1, 224] - [0, 0.0] From eed850a1a112f37e34f07552eb7d56c34996709d Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Wed, 26 Nov 2025 02:52:14 +0000 Subject: [PATCH 15/18] refine --- .../Tensile/Components/CustomSchedule.py | 51 +++++++++---------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index 2ae5aaf09f8..655df3eca2d 100755 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -403,42 +403,37 @@ def _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS): elif isTN(kernel) and not useLDSTr and TLDS == 1: #index and code pair syncTable = [-1, SWaitCnt(dscnt=7, vlcnt=-1, vscnt=-1, comment="for LRB1-0"), - 5, SWaitCnt(dscnt=6+4, vlcnt=-1, vscnt=-1, comment="for LRB1-1"), - 8, SBarrier(comment="for GRA start"), - 11, SWaitCnt(dscnt=5+8, vlcnt=-1, vscnt=-1, comment="for LRB1-2"), - 17, SWaitCnt(dscnt=4+11, vlcnt=-1, vscnt=-1, comment="for LRB1-3"), - 23, SWaitCnt(dscnt=15, vlcnt=-1, vscnt=-1, comment="for LRB1-4:6"), - 41, SWaitCnt(dscnt=14, vlcnt=-1, vscnt=-1, comment="for LRB1-7"), - 46, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="for LRA1"), - 47, SWaitCnt(dscnt=7, vlcnt=-1, vscnt=-1, comment="for LRB0-0"), - 48, SBarrier(comment="for LRA1 start"), - 51, SWaitCnt(dscnt=1, vlcnt=-1, vscnt=-1, comment="for LRB1"), - 78, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="for LRB0"), - 78, SBarrier(comment="for LRB1 start"),] + 5, SWaitCnt(dscnt=6+5, vlcnt=-1, vscnt=-1, comment="for LRB1"), + 14, SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="for LRA0 complete"), + 14, SBarrier(comment="for GRA start"), + 47, SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="for LRB0"), + 50, SWaitCnt(dscnt=-1, vlcnt=14+1, vscnt=-1, comment="for LRA1"), + 50, SBarrier(comment="for LRA1 start"), + 65, SWaitCnt(dscnt=-1, vlcnt=6+5, vscnt=-1, comment="for LRB0"), + 65, SBarrier(comment="for LRB1 start"),] optSchedule = { 'SYNC' : [syncTable[::2]], - 'GRIncA': [[0,1,2,3,4,5,6,7,8]], + 'GRIncA': [[6,6,7,7,8,8,9,9,9]], 'GRIncB': [[9,10,11,12,13,14,15,16,17]], - 'LRA0' : [[0, 2, 3, 4, 5, 6]], + 'LRA0' : [[0, 1, 2, 3, 4, 5], + [-1, 0, 1, 2, 3, 4]], 'LRB0' : [[7, 9, 11, 13, 15, 17, 19, 21], - [8, 10, 12, 14, 16, 18, 20, 22]], - 'GRA' : [[8,8, 10,10, 12,12, 14,14, 26,26, 31,31], - [9,9, 11,11, 13,13, 15,15, 27,27, 32,32]], + [8, 10, 12, 13, 16, 18, 20, 22]], + 'GRA' : [[14,14, 16,16, 18,18, 20,20, 25,25, 31,31], + [15,15, 17,17, 19,19, 21,21, 26,26, 32,32]], 'GRB' : [[46,46, 50,50, 54,54, 58,58, 62,62, 66,66, 70,70, 76,76], [47,47, 51,51, 55,55, 59,59, 63,63, 67,67, 71,71, 77,77]], - 'LRA1' : [[48, 52, 56, 58, 60, 64], - [49, 53, 57, 59, 61, 65]], - # 0 1 2 3 4 5 6 7 - 'LRB1' : [[78, 80, 82, 84, 86, 90, 92, 94], - [79, 81, 83, 85, 87, 91, 93, 95]], - - 'LRSA' : [[22]], - 'LRSB' : [[23]], - - 'LWSA' : [[20]], - 'LWSB' : [[78]], + 'LRA1' : [[50, 52, 56, 58, 60, 62], + [51, 53, 57, 59, 61, 63]], + 'LRB1' : [[65, 67, 69, 71, 73, 75, 77, 79], + [66, 68, 70, 72, 74, 76, 78, 80]], + + 'LRSA' : [[47]], + 'LRSB' : [[47]], + 'LWSA' : [[47]], + 'LWSB' : [[80]], 'LCC' : [[95, 95]], } syncCode = syncTable[1::2] From efda36c466b3408699605a4e4d7c77ff9885f509 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Fri, 28 Nov 2025 03:40:36 +0000 Subject: [PATCH 16/18] typo --- .../tensilelite/Tensile/Components/CustomSchedule.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index 2d244804a0f..267cc0873dc 100755 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -403,10 +403,10 @@ def _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS): elif isTN(kernel) and not useLDSTr and TLDS == 1: #index and code pair syncTable = [-1, SWaitCnt(dscnt=7, vlcnt=-1, vscnt=-1, comment="for LRB1-0"), - 5, SWaitCnt(dscnt=6+5, vlcnt=-1, vscnt=-1, comment="for LRB1"), + 5, SWaitCnt(dscnt=5, vlcnt=-1, vscnt=-1, comment="for LRB1"), 14, SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="for LRA0 complete"), 14, SBarrier(comment="for GRA start"), - 47, SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="for LRB0"), + 46, SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="for LRB0"), 50, SWaitCnt(dscnt=-1, vlcnt=14+1, vscnt=-1, comment="for LRA1"), 50, SBarrier(comment="for LRA1 start"), 65, SWaitCnt(dscnt=-1, vlcnt=6+5, vscnt=-1, comment="for LRB0"), From c4b1d4a5986ab42b8c4ecdad912f9548a9c45426 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Fri, 28 Nov 2025 06:00:35 +0000 Subject: [PATCH 17/18] disable LDSTrInst in test --- .../Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml index 3dd7b7267ae..705b499953e 100755 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml @@ -1068,6 +1068,7 @@ BenchmarkProblems: - ScheduleIterAlg: [3] - ExpandPointerSwap: [0] - TransposeLDS: [1] #0,1 + - LDSTrInst: [0] - LocalReadVectorWidth: [8] - GlobalReadVectorWidthA: [8] - GlobalReadVectorWidthB: [8] From 77c3ebdb94ea33da4e38f0e1e2ec905bd0e78dee Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Mon, 1 Dec 2025 07:02:44 +0000 Subject: [PATCH 18/18] refine --- .../gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml | 2 +- .../hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml index cb2a32a7d12..12b9b14d5e8 100755 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml @@ -308455,7 +308455,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 1316 + SolutionIndex: 1317 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index 01c171feac9..f8ba29aefd2 100755 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -496,6 +496,7 @@ def _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS): 14, SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="for LRA0 complete"), 14, SBarrier(comment="for GRA start"), 46, SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="for LRB0"), + 46, SBarrier(comment="for GRB start"), 50, SWaitCnt(dscnt=-1, vlcnt=14+1, vscnt=-1, comment="for LRA1"), 50, SBarrier(comment="for LRA1 start"), 65, SWaitCnt(dscnt=-1, vlcnt=6+5, vscnt=-1, comment="for LRB0"),